Fig 6A

##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")
####This section uses TCGA cohort..
#Download mutations (maf) data from: https://api.gdc.cancer.gov/data/1c8cfe5f-e52d-41ba-94da-f15ea1337efc 
#Download RNA seq data from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE62944
#Download CNV data from: http://api.gdc.cancer.gov/data/00a32f7a-c85f-4f86-850d-be53973cbc4d
#download clinical data from: http://api.gdc.cancer.gov/data/00a32f7a-c85f-4f86-850d-be53973cbc4d
#Download predicted neoepitopes from https://www.cell.com/cms/10.1016/j.cell.2014.12.033/attachment/603562c6-566b-4891-b940-7134b9f7367b/mmc3.txt
#Download HLA haplotypes from; https://cancerimmunolres.aacrjournals.org/highwire/filestream/38473/field_highwire_adjunct_files/1/224371_2_supp_5920266_q2tjr3.xlsx

##########
############RNA-seq data analysis..
#
#Load RNA data as counts, this is only for tumor data
RNA_counts_pan<-/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/RNA_seq/GSE62944_RAW/GSM1536837_06_01_15_TCGA_24.tumor_Rsubread_FeatureCounts.txt.gz
dim(RNA_counts_pan)
#Load cancer type correlation data with TCGA bar codes
bar_codes_pancancer <- read.delim("/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/RNA_seq/GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt", header=FALSE)
colnames(bar_codes_pancancer)<-c("sample","condition")#change column names
table(bar_codes_pancancer$condition)
# using a TCGAutils function
library(TCGAutils)
   #TCGA codes
bar_codes_pancancer$sample <- as.character(bar_codes_pancancer$sample) # TCGAutils needs to be as character
bar_codes_pancancer$bcr_patient_barcode <- TCGAbarcode(bar_codes_pancancer$sample, participant = TRUE) # takes out some data from xbarcode that is not needed
##Put names in the same order for the count and colData
 
  design_deseq2<-bar_codes_pancancer
  design_deseq2$sample<-gsub("-",".",design_deseq2$sample)
  

all(colnames(RNA_counts_pan) %in% design_deseq2$sample) #Must be true

all(colnames(RNA_counts_pan) == design_deseq2$sample) #Must be true

#Put in the same order
#Re-order equally
design_deseq2_2<-design_deseq2[order(factor(design_deseq2$sample, levels = colnames(RNA_counts_pan))),]




all(colnames(RNA_counts_pan) == design_deseq2_2$sample) ###all must be true
  
  
  

##Perform Deseq2 count normalization using imput matrix type
library("DESeq2")
DESEq_pancancer <- DESeqDataSetFromMatrix(countData = RNA_counts_pan,
                              colData = design_deseq2_2,
                              design = ~ condition)


DESEq_pancancer

##Run deseq2

dim(DESEq_pancancer)


keep <- rowSums(counts(DESEq_pancancer)) >= 4
length(keep)

DESEq_pancancer <- DESEq_pancancer[keep,]

dim(DESEq_pancancer)

#No genes deleted

#check design
design(DESEq_pancancer)


dds_DESEq_pancancer <- DESeq(DESEq_pancancer)

vsd_DESEq_pancancer <- vst(dds_DESEq_pancancer, blind=FALSE)

DESEq_pancancer_normalized<-assay(vsd_DESEq_pancancer)



####################
##################Mutataional data
###
#Load clinical data
library(readxl)
TCGA_clinical <- read_excel("/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/TCGA-CDR-SupplementalTableS1.xlsx")
TCGA_clinical<-as.data.frame(TCGA_clinical)

# # using a TCGAutils function
library(TCGAutils)
   #TCGA codes
bar_codes_pancancer$sample <- as.character(bar_codes_pancancer$sample) # TCGAutils needs to be as character
bar_codes_pancancer$bcr_patient_barcode <- TCGAbarcode(bar_codes_pancancer$sample, participant = TRUE) # takes out some data from xbarcode that is not needed

###Subset clinical data to contain only tumoral sample data from the RNA-seq
  TCGA_clinical2<-subset(TCGA_clinical, (TCGA_clinical$bcr_patient_barcode %in%  bar_codes_pancancer$bcr_patient_barcode))

# load pancancer .maf file, according to your local path
maf_panCancer <- read.maf("/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/mc3.v0.2.8.PUBLIC.maf.gz")

# adapt Tumor_Sample_Barcode to same length as in GBM and LGG, 12
# using a TCGAutils function
xbarcode <- maf_panCancer@data$Tumor_Sample_Barcode   #TCGA codes
xbarcode <- as.character(xbarcode) # TCGAutils needs to be as character
xbarcode_v2 <- TCGAbarcode(xbarcode, participant = TRUE) # takes out some data from xbarcode that is not needed

maf_panCancer@data$Tumor_Sample_Barcode <- xbarcode_v2
maf_panCancer@data$Tumor_Sample_Barcode <- as.factor(maf_panCancer@data$Tumor_Sample_Barcode) # maftools needs as.factor

# estimate variant allele frequency (VAF) 
maf_panCancer@data$VAF <- maf_panCancer@data$t_alt_count / (maf_panCancer@data$t_ref_count + maf_panCancer@data$t_alt_count)

####Make a subset for each tumor type, matching data from TCGA_clinal2


maf_ACC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="ACC")]))
maf_BLCA<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="BLCA")]))
maf_BRCA<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="BRCA")]))
maf_CESC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="CESC")]))
maf_COAD<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="COAD")]))
maf_DLBC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="DLBC")]))
maf_GBM<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="GBM")]))
maf_HNSC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="HNSC")]))
maf_KICH<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="KICH")]))
maf_KIRC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="KIRC")]))
maf_KIRP<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="KIRP")]))
#
maf_LAML<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="LAML")]))
maf_LGG<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="LGG")]))
maf_LIHC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="LIHC")]))
maf_LUAD<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="LUAD")]))
maf_LUSC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="LUSC")]))
maf_OV<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="OV")]))
maf_PRAD<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="PRAD")]))
maf_READ<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="READ")]))
maf_SKCM<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="SKCM")]))
maf_STAD<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="STAD")]))
maf_THCA<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="THCA")]))
maf_UCEC<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="UCEC")]))
maf_UCS<-subsetMaf(maf_panCancer, tsb = c(TCGA_clinical2$bcr_patient_barcode[which(TCGA_clinical2$type=="UCS")]))
#There are some missing samples..

###Load manual AICDA_enrichment and APOBEC_enrichment extraction functions
load("/media/isaias.hernandez/seagate_ICM/FFPE_PCNSL_98n/FF_vs_FFPE/AICDA_enrich_function.RData") #AICDA enrichment Function
load("/media/isaias.hernandez/seagate_ICM/FFPE_PCNSL_98n/FF_vs_FFPE/APOBEC_enrich_function.RData")



##The functions extracts the nucleotide context (tri or tetranucleoitdes), the enrichment tables and a maf of only enriched mutations...

#ACC
ACC_AICDA<-AICDA_enrichment(maf = maf_ACC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#enriched in  12.658 % of samples

ACC_APOBEC<-APOBEC_enrichment(maf = maf_ACC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)# enriched in  18.987 % of samples

#BLCA
BLCA_AICDA<-AICDA_enrichment(maf = maf_BLCA, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60) #enriched in  0.983 % of samples

BLCA_APOBEC<-APOBEC_enrichment(maf = maf_BLCA, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#enriched in  80.098 % of samples

#BRCA
BRCA_AICDA<-AICDA_enrichment(maf = maf_BRCA, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#enriched in  5.813 % of samples

BRCA_APOBEC<-APOBEC_enrichment(maf = maf_BRCA, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)# enriched in  27.586 % of samples


#CESC
CESC_AICDA<-AICDA_enrichment(maf = maf_CESC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)# enriched in  2.797 % of samples

CESC_APOBEC<-APOBEC_enrichment(maf = maf_CESC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#enriched in  70.979 % of samples


##COAD
COAD_AICDA<-AICDA_enrichment(maf = maf_COAD, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#enriched in  0.249 %

COAD_APOBEC<-APOBEC_enrichment(maf = maf_COAD, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)# enriched in  0.498 % of samples


#DLBCL
DLBC_AICDA<-AICDA_enrichment(maf = maf_DLBC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#enriched in  5.405 % of samples

DLBC_APOBEC<-APOBEC_enrichment(maf = maf_DLBC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#enriched in  0 % of samples


##GBM
GBM_AICDA<-AICDA_enrichment(maf = maf_GBM, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#enriched in  2.564 % of samples

GBM_APOBEC<-APOBEC_enrichment(maf = maf_GBM, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  0 % of samples

##HNSC
HNSC_AICDA<-AICDA_enrichment(maf = maf_HNSC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#enriched in  2.222 % of samples

HNSC_APOBEC<-APOBEC_enrichment(maf = maf_HNSC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  43.03 % of samples


##KICH
KICH_AICDA<-AICDA_enrichment(maf = maf_KICH, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  21.212 % of samples

KICH_APOBEC<-APOBEC_enrichment(maf = maf_KICH, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  4.545 % of samples


##KIRC
KIRC_AICDA<-AICDA_enrichment(maf = maf_KIRC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  10.899 % of samples

KIRC_APOBEC<-APOBEC_enrichment(maf = maf_KIRC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  3.27 % of samples


##KIRP
KIRP_AICDA<-AICDA_enrichment(maf = maf_KIRP, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  12.5 % of samples

KIRP_APOBEC<-APOBEC_enrichment(maf = maf_KIRP, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  8.571 % of samples


##LAML
LAML_AICDA<-AICDA_enrichment(maf = maf_LAML, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  21.818 % of samples, among highest since it is Acute Myeloid Leukemia

LAML_APOBEC<-APOBEC_enrichment(maf = maf_LAML, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  3.636 % of samples


##LGG
LGG_AICDA<-AICDA_enrichment(maf = maf_LGG, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  8.087 % of samples

LGG_APOBEC<-APOBEC_enrichment(maf = maf_LGG, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  1.775 % of samples


##LIHC
LIHC_AICDA<-AICDA_enrichment(maf = maf_LIHC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  4.202 % of samples

LIHC_APOBEC<-APOBEC_enrichment(maf = maf_LIHC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  3.641 % of samples



#LUAD
LUAD_AICDA<-AICDA_enrichment(maf = maf_LUAD, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  3.131 % of samples

LUAD_APOBEC<-APOBEC_enrichment(maf = maf_LUAD ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  28.18 % of samples


#LUSC
LUSC_AICDA<-AICDA_enrichment(maf = maf_LUSC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  1.253 % of samples

LUSC_APOBEC<-APOBEC_enrichment(maf = maf_LUSC ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  28.601 % of samples



#OV
OV_AICDA<-AICDA_enrichment(maf = maf_OV, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  4.545 % of samples

OV_APOBEC<-APOBEC_enrichment(maf = maf_OV ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  5.245 % of samples



#PRAD
PRAD_AICDA<-AICDA_enrichment(maf = maf_PRAD, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  8.554 % of samples

PRAD_APOBEC<-APOBEC_enrichment(maf = maf_PRAD ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  4.888 % of samples


#READ
READ_AICDA<-AICDA_enrichment(maf = maf_READ, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  1.389 % of samples

READ_APOBEC<-APOBEC_enrichment(maf = maf_READ ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  0 % of samples


#SKCM
SKCM_AICDA<-AICDA_enrichment(maf = maf_SKCM, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  1.293 % of samples

SKCM_APOBEC<-APOBEC_enrichment(maf = maf_SKCM ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  2.155 % of samples



#STAD
STAD_AICDA<-AICDA_enrichment(maf = maf_STAD, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  1.205 % of samples

STAD_APOBEC<-APOBEC_enrichment(maf = maf_STAD ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  3.373 % of samples

#THCA, highest values for both
THCA_AICDA<-AICDA_enrichment(maf = maf_THCA, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  20.168 % of samples

THCA_APOBEC<-APOBEC_enrichment(maf = maf_THCA ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  20.588  % of samples

#UCEC

UCEC_AICDA<-AICDA_enrichment(maf = maf_UCEC, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  2.652 % of samples

UCEC_APOBEC<-APOBEC_enrichment(maf = maf_UCEC ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)#  enriched in  6.439  % of samples

#UCS
UCS_AICDA<-AICDA_enrichment(maf = maf_UCS, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60)#AICDA related mutations are enriched in  3.509 % of samples

UCS_APOBEC<-APOBEC_enrichment(maf = maf_UCS ,ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE)
#  enriched in  7.018  % of samples




###Bind all AICDA tables and APOBEC  tables together
panCancer_AICDA_scores<-rbind(ACC_AICDA$AICDA_scores,BLCA_AICDA$AICDA_scores,BRCA_AICDA$AICDA_scores,CESC_AICDA$AICDA_scores,COAD_AICDA$AICDA_scores,DLBC_AICDA$AICDA_scores,GBM_AICDA$AICDA_scores,HNSC_AICDA$AICDA_scores,KICH_AICDA$AICDA_scores,KIRC_AICDA$AICDA_scores,KIRP_AICDA$AICDA_scores, LAML_AICDA$AICDA_scores, LGG_AICDA$AICDA_scores, LIHC_AICDA$AICDA_scores, LUAD_AICDA$AICDA_scores, LUSC_AICDA$AICDA_scores, OV_AICDA$AICDA_scores, PRAD_AICDA$AICDA_scores, READ_AICDA$AICDA_scores, SKCM_AICDA$AICDA_scores, STAD_AICDA$AICDA_scores, THCA_AICDA$AICDA_scores, UCEC_AICDA$AICDA_scores, UCS_AICDA$AICDA_scores)

panCancer_APOBEC_scores<-rbind(ACC_APOBEC$APOBEC_scores,BLCA_APOBEC$APOBEC_scores, BRCA_APOBEC$APOBEC_scores, CESC_APOBEC$APOBEC_scores, COAD_APOBEC$APOBEC_scores, DLBC_APOBEC$APOBEC_scores, GBM_APOBEC$APOBEC_scores, HNSC_APOBEC$APOBEC_scores, KICH_APOBEC$APOBEC_scores, KIRC_APOBEC$APOBEC_scores, KIRP_APOBEC$APOBEC_scores, LAML_APOBEC$APOBEC_scores, LGG_APOBEC$APOBEC_scores, LIHC_APOBEC$APOBEC_scores, LUAD_APOBEC$APOBEC_scores, LUSC_APOBEC$APOBEC_scores, OV_APOBEC$APOBEC_scores, PRAD_APOBEC$APOBEC_scores, READ_APOBEC$APOBEC_scores, SKCM_APOBEC$APOBEC_scores, STAD_APOBEC$APOBEC_scores, THCA_APOBEC$APOBEC_scores, UCEC_APOBEC$APOBEC_scores, UCS_APOBEC$APOBEC_scores)

#maf
panCancer_AICDA_maf<-rbind(ACC_AICDA$maf_AICDA,BLCA_AICDA$maf_AICDA,BRCA_AICDA$maf_AICDA,CESC_AICDA$maf_AICDA,COAD_AICDA$maf_AICDA,DLBC_AICDA$maf_AICDA,GBM_AICDA$maf_AICDA,HNSC_AICDA$maf_AICDA,KICH_AICDA$maf_AICDA,KIRC_AICDA$maf_AICDA,KIRP_AICDA$maf_AICDA, LAML_AICDA$maf_AICDA, LGG_AICDA$maf_AICDA, LIHC_AICDA$maf_AICDA, LUAD_AICDA$maf_AICDA, LUSC_AICDA$maf_AICDA, OV_AICDA$maf_AICDA, PRAD_AICDA$maf_AICDA, READ_AICDA$maf_AICDA, SKCM_AICDA$maf_AICDA, STAD_AICDA$maf_AICDA, THCA_AICDA$maf_AICDA, UCEC_AICDA$maf_AICDA, UCS_AICDA$maf_AICDA)

panCancer_APOBEC_maf<-rbind(ACC_APOBEC$MAF_APOBEC,BLCA_APOBEC$MAF_APOBEC, BRCA_APOBEC$MAF_APOBEC, CESC_APOBEC$MAF_APOBEC, COAD_APOBEC$MAF_APOBEC, DLBC_APOBEC$MAF_APOBEC, GBM_APOBEC$MAF_APOBEC, HNSC_APOBEC$MAF_APOBEC, KICH_APOBEC$MAF_APOBEC, KIRC_APOBEC$MAF_APOBEC, KIRP_APOBEC$MAF_APOBEC, LAML_APOBEC$MAF_APOBEC, LGG_APOBEC$MAF_APOBEC, LIHC_APOBEC$MAF_APOBEC, LUAD_APOBEC$MAF_APOBEC, LUSC_APOBEC$MAF_APOBEC, OV_APOBEC$MAF_APOBEC, PRAD_APOBEC$MAF_APOBEC, READ_APOBEC$MAF_APOBEC, SKCM_APOBEC$MAF_APOBEC, STAD_APOBEC$MAF_APOBEC, THCA_APOBEC$MAF_APOBEC, UCEC_APOBEC$MAF_APOBEC, UCS_APOBEC$MAF_APOBEC)

#3nt context taking all pancancer mutations per tumor type
panCancer_3nt_matrix<-rbind(colSums(ACC_APOBEC$nmf_matrix),colSums(BLCA_APOBEC$nmf_matrix),colSums(BRCA_APOBEC$nmf_matrix),colSums(CESC_APOBEC$nmf_matrix),colSums(COAD_APOBEC$nmf_matrix),colSums(DLBC_APOBEC$nmf_matrix),colSums(GBM_APOBEC$nmf_matrix),colSums(HNSC_APOBEC$nmf_matrix),colSums(KICH_APOBEC$nmf_matrix),colSums(KIRC_APOBEC$nmf_matrix),colSums(KIRP_APOBEC$nmf_matrix), colSums(LAML_APOBEC$nmf_matrix), colSums(LGG_APOBEC$nmf_matrix), colSums(LIHC_APOBEC$nmf_matrix),colSums(LUAD_APOBEC$nmf_matrix), colSums(LUSC_APOBEC$nmf_matrix), colSums(OV_APOBEC$nmf_matrix), colSums(PRAD_APOBEC$nmf_matrix), colSums(READ_APOBEC$nmf_matrix), colSums(SKCM_APOBEC$nmf_matrix), colSums(STAD_APOBEC$nmf_matrix), colSums(THCA_APOBEC$nmf_matrix), colSums(UCEC_APOBEC$nmf_matrix), colSums(UCS_APOBEC$nmf_matrix))
rownames(panCancer_3nt_matrix)<-Tumor_type

##Save 5nt matrix as sum per tumor type (all mutations)
panCancer_5nt_matrix<-rbind(colSums(ACC_AICDA$nmf_matrix),colSums(BLCA_AICDA$nmf_matrix),colSums(BRCA_AICDA$nmf_matrix),colSums(CESC_AICDA$nmf_matrix),colSums(COAD_AICDA$nmf_matrix),colSums(DLBC_AICDA$nmf_matrix),colSums(GBM_AICDA$nmf_matrix),colSums(HNSC_AICDA$nmf_matrix),colSums(KICH_AICDA$nmf_matrix),colSums(KIRC_AICDA$nmf_matrix),colSums(KIRP_AICDA$nmf_matrix), colSums(LAML_AICDA$nmf_matrix), colSums(LGG_AICDA$nmf_matrix), colSums(LIHC_AICDA$nmf_matrix),colSums(LUAD_AICDA$nmf_matrix), colSums(LUSC_AICDA$nmf_matrix), colSums(OV_AICDA$nmf_matrix), colSums(PRAD_AICDA$nmf_matrix), colSums(READ_AICDA$nmf_matrix), colSums(SKCM_AICDA$nmf_matrix), colSums(STAD_AICDA$nmf_matrix), colSums(THCA_AICDA$nmf_matrix), colSums(UCEC_AICDA$nmf_matrix), colSums(UCS_AICDA$nmf_matrix))
rownames(panCancer_5nt_matrix)<-Tumor_type

##Get 5nt matrix for only AICDA mutations...


##Subset TCGA_clinal to contain same samples as the previous tables and merge data


TCGA_clinical4<-subset(TCGA_clinical2, TCGA_clinical2$bcr_patient_barcode %in% panCancer_AICDA_scores$Tumor_Sample_Barcode) #Subset so they have the same info

panCancer_AICDA_scores<-panCancer_AICDA_scores[order(factor(panCancer_AICDA_scores$Tumor_Sample_Barcode, levels = TCGA_clinical4$bcr_patient_barcode)),]#put in same order
all(TCGA_clinical4$bcr_patient_barcode == panCancer_AICDA_scores$Tumor_Sample_Barcode)#must be true
colnames(panCancer_AICDA_scores)[c(39,44)]<-c("fisher_pvalue_AICDA","fdr_AICDA")#Change some colnames

panCancer_APOBEC_scores<-panCancer_APOBEC_scores[order(factor(panCancer_APOBEC_scores$Tumor_Sample_Barcode, levels = TCGA_clinical4$bcr_patient_barcode)),]#put in same order
all(TCGA_clinical4$bcr_patient_barcode == panCancer_APOBEC_scores$Tumor_Sample_Barcode)#must be true
colnames(panCancer_APOBEC_scores)[c(39,44)]<-c("fisher_pvalue_APOBEC","fdr_APOBEC")#Change some colnames


TCGA_clinical4<- cbind(TCGA_clinical4, panCancer_AICDA_scores[,c(18,20:28,36:39,43:44)],panCancer_APOBEC_scores[,c(20:28,36:39,43:44)]) #add panCancerAnnotation data 


#####Adding AICDA tag
####Get the signature contribution of every tumor type using also the AICDA_canonical signature
#Prepare "vcf" from maf_pancancer global mutations
###Make a subset from the global pancancer to contain the samples included in the previous analysis


maf_panCancer_global<-maf_panCancer@data[maf_panCancer@data$Tumor_Sample_Barcode %in% panCancer_AICDA_maf$Tumor_Sample_Barcode,]#Subset to contain the same samples as AICDA pancancer
maf_panCancer_global<-as.data.frame(maf_panCancer_global)

maf_panCancer_global$Tumor_type<-TCGA_clinical2$type[match(maf_panCancer_global$Tumor_Sample_Barcode,TCGA_clinical2$bcr_patient_barcode)]#Add tumor type to pancancer maf object

maf_panCancer_global$Mut_identifier<-paste0(maf_panCancer_global$Tumor_Sample_Barcode,"-",maf_panCancer_global$Start_Position,"-",maf_panCancer_global$CONTEXT,"-",maf_panCancer_global$t_alt_count) ##To Id AICDA mutations easier

panCancer_AICDA_maf$Mut_identifier<-paste0(panCancer_AICDA_maf$Tumor_Sample_Barcode,"-",panCancer_AICDA_maf$Start_Position,"-",panCancer_AICDA_maf$CONTEXT,"-",panCancer_AICDA_maf$t_alt_count)



maf_panCancer_global$AICDA_mutation<-ifelse(maf_panCancer_global$Mut_identifier %in% panCancer_AICDA_maf$Mut_identifier, "YES","NO")###Add a column next to each mutation to indicate if it is AICDA induced or not



  
####################
##################Mutational signatures..
###
# prepare data as input to dndscv package to detect gene drivers
maf_drivers <- maf_panCancer_global[,c("Tumor_Sample_Barcode", "Variant_Type", "Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count", "t_depth", "n_depth", "Hugo_Symbol","Tumor_type")]
    signif_genes<-list() ;signif_genes_top<-list()                            
maf_panCancer_global$Driver<-0

for (i in 13:24) {
  maf_drivers_tmp<-maf_drivers[maf_drivers$Tumor_type %in% Tumor_type[i],]
 # maf_drivers_tmp<-maf_drivers_tmp[maf_drivers_tmp$Variant_Type %in%"SNP",] #improves calculation
  ##Apply per tumor type   
dndsout = dndscv(maf_drivers_tmp[,c(1,3,4,5,6)])
# 
sel_cv= dndsout$sel_cv
signif_genes[[i]] = sel_cv[sel_cv$qglobal_cv<0.01, c("gene_name","qglobal_cv")]
rownames(signif_genes[[i]]) = NULL

signif_genes_top[[i]] <- signif_genes[[i]][1:25,1]

# add driver feature in the maf file                  
maf_panCancer_global$Driver[maf_panCancer_global$Tumor_type %in% Tumor_type[i]] <- ifelse( maf_panCancer_global$Hugo_Symbol[maf_panCancer_global$Tumor_type %in% Tumor_type[i]] %in% signif_genes_top[[i]], 1, 0)
  
}
##LAML could not be calculated, do for 1:11 and then 13:24

names(signif_genes_top)<-Tumor_type


maf_panCancer_NO_AICDA<-maf_panCancer_global[maf_panCancer_global$AICDA_mutation=="NO",c("Tumor_Sample_Barcode", "Variant_Type", "Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count", "t_depth", "n_depth", "Driver","Tumor_type","Hugo_Symbol")]    ##SUbset to not associated AICDA mutations and only data for palimpsest
    
    maf_panCancer_NO_AICDA<-as.data.frame(maf_panCancer_NO_AICDA)
# New version suports SNP, DEL and INS so there is no need to subset only SNP
    # load data from Palimpsest package, according to its path                  

colnames(maf_panCancer_NO_AICDA) <- c(colnames(vcf[1:10]),"Tumor_type","Gene_Name")

maf_panCancer_NO_AICDA$Type <- ifelse(maf_panCancer_NO_AICDA$Type == "SNP", yes = "SNV", ifelse(maf_panCancer_NO_AICDA$Type == "DEL", yes = "DEL","INS"))


#Preparing input data for mutational signature analysis
##Annotate mutations
vcf_panCancer_NO_AICDA<-annotate_VCF(vcf=maf_panCancer_NO_AICDA,ref_genome =BSgenome.Hsapiens.UCSC.hg19, ref_fasta=NULL , add_ID_cats=F,add_DBS_cats = F)
dim(vcf_panCancer_NO_AICDA)


##Do a loop for each tumor type.....extracting a specific AICDA_signature per cancer type...
SBS_input_pancancer_NO_AICDA<-list(); SBS_signatures_exp_pancancer_NO_AICDA <-list()

exclude_sigs<-c("SBS27","SBS39","SBS43","SBS45" , "SBS46" , "SBS47" , "SBS48"  ,"SBS49" , "SBS50" , "SBS51" , "SBS52" , "SBS53" , "SBS54" , "SBS55","SBS56" , "SBS57" , "SBS58" , "SBS59" , "SBS60")#Signatures not to include to see if it helps the fitting,  18 are posible sequencing artefacts; SBS39 posible noise causing with AICDA signature

for (i in 1:24) {
SBS_cosmic_bytumor<-SBS_cosmic[setdiff(rownames(SBS_cosmic),exclude_sigs),]##l

##Plot cosine similarities
##Apply now to global maf  

  vcf_panCancer_NO_AICDA_tmp<-vcf_panCancer_NO_AICDA[vcf_panCancer_NO_AICDA$Tumor_type %in% Tumor_type[i],] #Subset by tumor type
  SBS_input_pancancer_NO_AICDA[[i]]<-palimpsest_input(vcf = vcf_panCancer_NO_AICDA_tmp, Type = "SBS") #Get tnm proportions and numbers by sample
  
resdir.2<-paste0("/media/user/seagate_ICM/AICDA_pancancer/Mutational_signatures/Semi_deconvolutionApproach",Tumor_type[i],"/");if(!file.exists(resdir.2)) dir.create(resdir.2) #Create directory if it does not exist


#sig_cols<-signature_colour_generator(rownames(SBS_cosmic_bytumor)) #Asign colors
load("r/Mutational_Signature_colors.RData") #Color for signatures

SBS_signatures_exp_pancancer_NO_AICDA[[i]] <- deconvolution_fit(input_vcf = vcf_panCancer_NO_AICDA_tmp,input_matrices = SBS_input_pancancer_NO_AICDA[[i]],input_signatures = SBS_cosmic_bytumor,signature_colours = sig_cols,resdir = resdir.2,doplot = F,save_signatures_exp = F) #Calcules weight for each signature
select <- which(colMeans(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums) > 1) #Select signatures with real contribution
sig_cols<-signature_colour_generator(names(select))

##Re-fit using only selected signatures
#selected mutational known signatures in tumor[i] samples   
X<-as.character(names(select))
signatures3 <-SBS_cosmic_bytumor[X,]

SBS_signatures_exp_pancancer_NO_AICDA[[i]] <- deconvolution_fit(input_vcf = vcf_panCancer_NO_AICDA_tmp,input_matrices = SBS_input_pancancer_NO_AICDA[[i]],input_signatures = signatures3,signature_colours = sig_cols,resdir = resdir.2,doplot = F,save_signatures_exp = T) #Calcules weight for each signature, re-fit to improve visualization
  
#####now add a column for the AICDA_canonical using the previously obtain numbers, add at $sig_nums
SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums$AICDA_canonical<-TCGA_clinical4$AICDA_mutations[TCGA_clinical4$bcr_patient_barcode %in% rownames(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums)]

###Now recalculate each sig_props using the new mutations numbers
SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props$AICDA_canonical<-0
SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props<-round(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums[,]/rowSums(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums[,]),7)




}


 names(SBS_signatures_exp_pancancer_NO_AICDA)<-Tumor_type[]
 
##This is similar to Figure 1D

 p_signatures_semideconv<-list();total_n<-rep(1,24)
for (i in 1:24) {
  order<-NULL
  if (is.null(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props$AICDA_canonical)=="FALSE") {
  order<-order(colSums(t(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props$AICDA_canonical)), decreasing = T)#Order according to highest AICDA canonical proportion (if present)
  }
  order_sigs<-colnames(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props)
  if (is.null(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props$SBS9)=="FALSE") {
  order_sigs<-  order_sigs[c(which(order_sigs!="SBS9"),which(order_sigs=="SBS9"))]
}#Put AICDA and SBS9 together (if present)
  if (is.null(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props$AICDA_canonical)=="FALSE") {
    order_sigs<-order_sigs[c(which(order_sigs!="AICDA_canonical"),which(order_sigs=="AICDA_canonical"))]
  }
  p_signatures_semideconv[[i]]<-signature_exposure_plot2(regression_line = "AICDA_canonical",order = order,signature_colours = sig_cols,mutSign_nums =   SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums[,order_sigs],mutSign_props =SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props[,order_sigs] )+ ggtitle(paste0("Mutational Signature exposure in ",names(SBS_signatures_exp_pancancer_NO_AICDA[i])," (n = ",nrow(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums),")"))+theme(axis.text.x=element_blank())+xlab(paste0(names(SBS_signatures_exp_pancancer_NO_AICDA[i])," (n = ",nrow(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums),")")) #Remove samples names
  total_n[i]<-nrow(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums)
}
names(total_n)<-Tumor_type

###Arrange together using grid
gb_psignatures_semideconv<-list()
for (i in 1:24) {
  if (i%in%c(1,6,11,16,21)) {
    gb_psignatures_semideconv[[i]] = grid.grabExpr({
    print(p_signatures_semideconv[[i]]+theme(legend.position='none')+theme(plot.title = element_blank(), axis.ticks.x=element_blank(),axis.line.x=element_blank())+scale_y_continuous(expand = c(0,0)))
})
  }else{
  if(i %in% setdiff(1:24,c(1,6,11,16,21))){
    gb_psignatures_semideconv[[i]] = grid.grabExpr({
    print(p_signatures_semideconv[[i]]+theme(legend.position='none')+ theme(plot.title = element_blank(), axis.ticks.x=element_blank(),axis.line.x=element_blank(), axis.ticks.y=element_blank(),axis.line.y=element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank())+scale_y_continuous(expand = c(0,0)) )
})}
  }
}


###Arrange by 5
gb_psignatures_semideconv_by5<-list()

for (j in 0:4) {
  
grid.newpage()
  k<-1+(5*j)
for (i in 0:4) {
  
pushViewport(viewport(y = 0, x=0.1+(.2*i),height = 1, width = .2, just = c("bottom")))
grid.draw(gb_psignatures_semideconv[[i+k]])
popViewport()
}
gb_psignatures_semideconv_by5[[j+1]]<- recordPlot()
}
gb_psignatures_semideconv_by5[[5]]<-recordPlot()

library(gridGraphics)
library(grid)
grid.echo()
gb_psignatures_semideconv_by5_2<-list()
gb_psignatures_semideconv_by5[[1]]
gb_psignatures_semideconv_by5_2[[1]] <- grid.grab()
gb_psignatures_semideconv_by5[[2]]
gb_psignatures_semideconv_by5_2[[2]] <- grid.grab()
gb_psignatures_semideconv_by5[[3]]
gb_psignatures_semideconv_by5_2[[3]] <- grid.grab()
gb_psignatures_semideconv_by5[[4]]
gb_psignatures_semideconv_by5_2[[4]] <- grid.grab()
gb_psignatures_semideconv_by5[[5]]
gb_psignatures_semideconv_by5_2[[5]] <- grid.grab()
#grab legend
##for legend
sig_cols_plot_semi<-"SBS1"

for (i in 1:24) {
  sig_cols_plot_semi<-c(sig_cols_plot_semi,colnames(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_props))
}

sig_cols_plot_semi<-unique(sig_cols_plot_semi);sig_cols_plot_semi<-sig_cols[sig_cols_plot_semi]
sig_cols_plot_semi<-sig_cols_plot_semi[order(factor(names(sig_cols_plot_semi), levels = names(sig_cols)))]#reorder

lgd_semi = Legend(labels = names(sig_cols_plot_semi), legend_gp = gpar(fill = sig_cols_plot_semi), 
    title = "SBS signatures", nrow = 3,title_position = "topcenter",title_gp = gpar(col = "black", fontsize = 14,fontface="bold"))
## draw it, changes optional
grid.newpage()
grid.draw( editGrob(gb_psignatures_semideconv_by5_2[[1]], vp=viewport(y = 0.82, x=0.5,height = .18, width = 1, just = c("bottom")), gp=gpar(fontsize=10)))
grid.draw( editGrob(gb_psignatures_semideconv_by5_2[[2]], vp=viewport(y = 0.64, x=0.5,height = .18, width = 1, just = c("bottom")), gp=gpar(fontsize=10)))
grid.draw( editGrob(gb_psignatures_semideconv_by5_2[[3]], vp=viewport(y = 0.46, x=0.5,height = .18, width = 1, just = c("bottom")), gp=gpar(fontsize=10)))
grid.draw( editGrob(gb_psignatures_semideconv_by5_2[[4]], vp=viewport(y = 0.28, x=0.5,height = .18, width = 1, just = c("bottom")), gp=gpar(fontsize=10)))
grid.draw( editGrob(gb_psignatures_semideconv_by5_2[[5]], vp=viewport(y = 0.1, x=0.5,height = .18, width = 1, just = c("bottom")), gp=gpar(fontsize=10)))

pushViewport(viewport(  y = 0.075,x=.5,height = .15,width =.8))
grid.draw(lgd_semi)


p_pancancer_signatures_semideconv<-recordPlot()


##This is similar to Figure 1C

###Summary graph of signatures per tumor type, using corrplot + other packages
##Get median signature weight by tumor for each signature (this is going to be represented by the circle size)

##Get edian signature weight by tumor for each signature (this is going to be represented by the circle size)
signature_proportions_semi<-as.data.frame(x = matrix(data=0,nrow = 24,ncol = length(sig_cols),dimnames  = list(Tumor_type,names(sig_cols)))) #Create empty data frame to fill
signature_mutations_median_semi<-as.data.frame(x = matrix(data=0,nrow = 24,ncol = length(sig_cols),dimnames  = list(Tumor_type,names(sig_cols))))
for (i in 1:24) {
    count_tmp<-colSums(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums[,])/sum(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums)
  
    signature_proportions_semi[i,match(names(count_tmp),colnames(signature_proportions_semi))]<-count_tmp
    
    count_tmp<-SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums
    for (j in 1:ncol(count_tmp)) {
        count_tmp2<-count_tmp[,j]
        count_tmp2<- count_tmp2[count_tmp2>0]
        signature_mutations_median_semi[i,match(names(count_tmp)[j],colnames(signature_mutations_median_semi))]<-median(count_tmp2)/38 #To calculate the TMB per megabase, the total number of mutations counted is divided by the size of the coding region of the targeted territory; We used 38 Mb as the estimate of the exome size.
    }
    
}




#Change a little the signatures order to improve analysis 
signature_proportions_semi2<-signature_proportions_semi[,c(1:11,47,12,13:46)] #Reorder to put AICDA next to SBS9
signature_mutations_median_semi2<-signature_mutations_median_semi[,c(1:11,47,12,13:46)]
rownames(signature_mutations_median_semi2)<-paste0(rownames(signature_mutations_median_semi)," (n = ", total_n, " )")
signature_mutations_median_semi2<-signature_mutations_median_semi2[,colnames(signature_mutations_median_semi2[colSums(signature_mutations_median_semi2)>0])]#Eliminate signatures not present in any tumor type

signature_proportions_semi2<-signature_proportions_semi2[,colnames(signature_mutations_median_semi2)]
library(corrplot)
signature_proportions_semi3<-signature_proportions_semi2#To set blanks where there is no signature signal
signature_proportions_semi3<-ifelse(signature_proportions_semi3==0, 1,0.000)

#display.brewer.all() to check color list
#708090
#For background colors where there are values
ColorScheme = c("ivory1", "ivory2")
Groups = c(rep(1,24), rep(2,24))
#) take out legend

#For background colors where there are values where there are values 0 we have to refill according to previous background color

ColorScheme_insignificant<-ifelse(signature_proportions_semi3==1, 1,NA)
ColorScheme_insignificant[,seq(2,ncol(ColorScheme_insignificant),by=2)]<-ifelse(ColorScheme_insignificant[,seq(2,ncol(ColorScheme_insignificant),by=2)]==1, "ivory2",NA)
ColorScheme_insignificant[,seq(1,ncol(ColorScheme_insignificant),by=2)]<-ifelse(ColorScheme_insignificant[,seq(1,ncol(ColorScheme_insignificant),by=2)]==1, "ivory1",NA)

ColorScheme_insignificant<-ColorScheme_insignificant[!is.na(ColorScheme_insignificant)]
##Make color and size legends manually

max(signature_proportions_semi2)
[1] 0.5085297
 min(signature_proportions_semi2)
[1] 0

###MUltiple size vector by a factor to increase circle size (since max number is .508)
corrplot_circleSize(is.corr = FALSE,corr = as.matrix(signature_mutations_median_semi2),na.label = " ",col = c(brewer.pal(n = 10, name = "Set3"),brewer.pal(n = 10, name = "Paired")), tl.col = "black", order = "original", tl.cex = 0.6,size_vector= as.numeric(as.matrix(signature_proportions_semi2[,])*1.4),method = "circle", cl.pos = "n",p.mat = signature_proportions_semi3, insig = "blank",bg = ColorScheme[Groups],bg_insig  = ColorScheme_insignificant,mar = c(0,1,0,0),at = c(0,0.05,.10,.25,.5,1,2.5,5,10,25),color_at = c("white",brewer.pal(n = 9, name = "Paired")))


##Add manual rectangles
segments(c(9.5,11.5), rep(0.5,2), c(9.5,11.5), rep(24.5,2), lwd=3) #move 1 in x per box, vertical lines (x0,y0,x1,y1)


segments(c(9.5, 9.5), c(0.5, 24.5), c(11.5, 11.5), c(0.5,24.5), lwd=3) #Horizontal lines

#Obtional, add description to each SBS text(x = 1, y = 26.2, "(H", srt = 90,cex=0.6) 

#Add text

#Grab

lgd2 = Legend(at = rev(c(0,0.05,.10,.25,.5,1,2.5,5,10,25)), legend_gp = gpar(fill = rev(c("white",brewer.pal(n = 9, name = "Paired")))), 
              title = "Median mutations per Mb due signature \n(among tumors with signature)", border = "black",title_gp = gpar(col = "black", fontsize = 10,fontface="bold"),ncol = 1, title_position = "leftcenter-rot",grid_height = unit(9, "mm"), grid_width = unit(9, "mm"))
## draw it, changes optional (Legend for circle colors)

lgd3 = Legend( title = "Signature weight contribution \nwithin tumor type", type = "points", at =seq(0,.54,by=.06),title_gp = gpar(col = "black", fontsize = 10,fontface="bold"), pch = 19, legend_gp = gpar(col = "ivory4",cex=seq(0,.54,by=.06)),size =unit(c(0,seq(1.2,4.4,by=.4)*2.4), "mm") ,ncol=1,border = "white",title_position = "leftcenter-rot",grid_height = unit(9, "mm"), grid_width = unit(9, "mm")) ## draw it, changes optional (Legend for sizes)
## draw it, changes optional (Legend for sizes)  

pd = packLegend( lgd2, lgd3, direction = "vertical",row_gap = unit(.8, "cm"))
pushViewport(viewport(  y = 0.358,x=0.05,height = .1,width =.1,just = c("bottom")))

grid.draw(pd)
p_pann_summary_signatures_semideconv<-recordPlot()

##Save 16*20

####################
##################CLonality analysis...
###

## copy number analysis
#######################
#######################

library(sigminer)                                 

# data frame with CN from ABSOLUTE   
##Download from https://api.gdc.cancer.gov/data/0f4f5701-7b61-41ae-bda9-2805d1ca9781
absolute_pancancer <- read.delim("/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/Pancancer_absolute_data/TCGA_mastercalls.abs_segtabs.fixed.txt")       
# data frame with ploidy from ABSOLUTE from https://api.gdc.cancer.gov/data/4f277128-f793-4354-a13d-30cc7fe9f6b5         
absolute_pancancer_ploidy <- read.delim("/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/Pancancer_absolute_data/TCGA_mastercalls.abs_tables_JSedit.fixed.txt")
# modify sample ID
xbarcode_pan_v2 <- TCGAbarcode(as.character(absolute_pancancer$Sample), participant = TRUE)
absolute_pancancer$Sample <- xbarcode_pan_v2
dim(absolute_pancancer)                  
xbarcode_ploidy_v2 <- TCGAbarcode(as.character(absolute_pancancer_ploidy$array), participant = TRUE)
absolute_pancancer_ploidy$Sample <- xbarcode_ploidy_v2   

dim(absolute_pancancer_ploidy)

# subset samples from which we previously calculated the signatures (maf_panCancer_global)
length(unique(maf_panCancer_global$Tumor_Sample_Barcode)) #7409



absolute_pancancer_ourData <- subset(absolute_pancancer, absolute_pancancer$Sample %in% unique(maf_panCancer_global$Tumor_Sample_Barcode))
      dim(absolute_pancancer_ourData)                           

absolute_pancancer_ploidy_ourData<- subset(absolute_pancancer_ploidy, absolute_pancancer_ploidy$Sample %in% unique(maf_panCancer_global$Tumor_Sample_Barcode))     

dim(absolute_pancancer_ploidy_ourData)
                    
absolute_pancancer_ourData_combined <- merge(absolute_pancancer_ourData, 
                                                    absolute_pancancer_ploidy_ourData, by = "Sample")
dim(absolute_pancancer_ourData_combined)          

# remove NA in CN data
absolute_pancancer_ourData_combined2 <-absolute_pancancer_ourData_combined[complete.cases(absolute_pancancer_ourData_combined[,7:9]),]

# logR 
absolute_pancancer_ourData_combined2$logR <- log2((absolute_pancancer_ourData_combined2$Modal_HSCN_1 + absolute_pancancer_ourData_combined2$Modal_HSCN_2 + 0.3 ) / (absolute_pancancer_ourData_combined2$ploidy ))


##########################################
##########################################
##                                     ###
##          clonality analysis         ###
##                                     ###
##########################################
##########################################  

# prepare copy number file for Palimpsest analysis

# column names required by copy number analysis in Palimpsest
palimpsest_columns <- c("Sample", 
                        "Chromosome", 
                        "Start", 
                        "End", 
                        "logR", 
                        "Modal_HSCN_1", 
                        "Modal_HSCN_2", 
                        "Modal_Total_CN", 
                        "ploidy")   

# column number
col.num <- which(colnames(absolute_pancancer_ourData_combined2) %in% palimpsest_columns)
                                 
# subset the required input for palimpsest                                 
absolute_palimpsest <- absolute_pancancer_ourData_combined2[, col.num]
  dim(absolute_palimpsest)
  
# prepare anno_data file for palimpsest analysis
# subset clinical data to obtain the same samples selected from the ABSOLUTE output
  TCGA_clinical2_absolute<- subset(TCGA_clinical2, 
                                     TCGA_clinical2$bcr_patient_barcode %in% absolute_pancancer_ourData_combined2$Sample)
    dim(TCGA_clinical2_absolute)                      
     #7216   61
    
    
# remove recurrence samples
  dim(absolute_pancancer_ploidy_ourData)
#[1] 7309   11
    
absolute_pancancer_ploidy_ourData2 <- absolute_pancancer_ploidy_ourData[!duplicated(absolute_pancancer_ploidy_ourData$Sample), ]

dim(absolute_pancancer_ploidy_ourData2)
#[1] 7270   11

# common samples
absolute_pancancer_ploidy_ourData3 <- subset(absolute_pancancer_ploidy_ourData2, absolute_pancancer_ploidy_ourData2$Sample %in% absolute_pancancer_ourData_combined2$Sample)
dim(absolute_pancancer_ploidy_ourData3)
#[1] 7216   11

                                   
#Re-order equally
TCGA_clinical2_absolute<-TCGA_clinical2_absolute[order(factor(TCGA_clinical2_absolute$bcr_patient_barcode, levels = absolute_pancancer_ploidy_ourData3$Sample)),]


# sanity check                 
table(unique(TCGA_clinical2_absolute$bcr_patient_barcode) == unique(absolute_pancancer_ploidy_ourData3$Sample)) # should be all TRUE   



absolute_pancancer_ploidy_ourData3$Sex <- TCGA_clinical2_absolute$gender 

# recode Sex variable according to palimpsest
absolute_pancancer_ploidy_ourData3$Sex <- ifelse(absolute_pancancer_ploidy_ourData3$Sex == "Female", 
                                                        "F", 
                                                        "M")      


# palimpsest anno_data file
palimpsest_data_columns <- c("Sample", 
                        "Sex", 
                        "purity")   

# column number
col.num_anno_data <- which(colnames(absolute_pancancer_ploidy_ourData3) %in% palimpsest_data_columns)
                                 
# subset the required input for palimpsest                                 
absolute_anno_data <- absolute_pancancer_ploidy_ourData3[, col.num_anno_data]
# change column order
absolute_anno_data2 <- absolute_anno_data[,c(2,3,1)]                                 

# change column names to palimpsest requirements
colnames(absolute_palimpsest) <- c("Sample", "CHROM", "POS_START", "POS_END", "Nmin", "Nmaj", "ntot", "LogR", "Ploidy")         
colnames(absolute_anno_data2) <- c("Sample", "Gender", "Purity")     
                             
    
# add chr in chromosome's name
absolute_palimpsest$CHROM <- sub("^", "chr", absolute_palimpsest$CHROM )     

#Subset to include the samples that does contain Gender info
unique(absolute_anno_data2$Gender) == "M"
#[1]  TRUE
dim(absolute_anno_data2)
#[1] 7216  3
absolute_anno_data3<-absolute_anno_data2[complete.cases(absolute_anno_data2[,"Gender"]),]
dim(absolute_anno_data3)
#[1] 7216  3


##Subset in absolute_palimpsest
absolute_palimpsest2<-subset(absolute_palimpsest, absolute_palimpsest$Sample %in% absolute_anno_data3$Sample)
length(unique(absolute_palimpsest$Sample))
length(unique(absolute_palimpsest2$Sample))

# Add signature probability columns to the original vcf table
vcf_sigOrigins_allmuts

                                 
# subset common samples in CN
vcf_sigOrigins_allmuts2 <- subset(vcf_sigOrigins_allmuts, 
                              vcf_sigOrigins_allmuts$Sample %in% absolute_palimpsest2$Sample)
                                 
dim(vcf_sigOrigins_allmuts2)


# it takes a while,  !
vcf_sigOrigins_allmuts2_CCF <- cnaCCF_annot(vcf=vcf_sigOrigins_allmuts2,
                                        annot_data = absolute_anno_data3,
                                        cna_data = absolute_palimpsest2,CCF_boundary=0.95) 



# subset incomplete samples
vcf_sigOrigins_allmuts2_CCF2 <- vcf_sigOrigins_allmuts2_CCF[complete.cases(vcf_sigOrigins_allmuts2_CCF[,"ntot"]),]

dim(vcf_sigOrigins_allmuts2_CCF)
dim(vcf_sigOrigins_allmuts2_CCF2)
 
                                
# plots!   ###Graph per sample...avoid due to saturation  



#cnaCCF_plots(vcf=vcf_sigOrigins_allmuts2_CCF2,resdir="/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/Clonality") 
                                 

vcf_sigOrigins_allmuts2_CCF2_clonal <- vcf_sigOrigins_allmuts2_CCF2[which(vcf_sigOrigins_allmuts2_CCF2$Clonality=="clonal"),]

vcf_sigOrigins_allmuts2_CCF2_clonal2 <- droplevels(vcf_sigOrigins_allmuts2_CCF2_clonal)                                 
##Now subclonal
vcf_sigOrigins_allmuts2_CCF2_subclonal <- vcf_sigOrigins_allmuts2_CCF2[which(vcf_sigOrigins_allmuts2_CCF2$Clonality=="subclonal"),]

vcf_sigOrigins_allmuts2_CCF2_subclonal2 <- droplevels(vcf_sigOrigins_allmuts2_CCF2_subclonal)                              



#Generates plots by sample, set to off, do it by subsetting by tumor type 
signatures_exp_clonal<-list()
signatures_exp_subclonal<-list()

palimpsest_clonalitySigsCompare2<-function (clonsig = clonsig, subsig = subsig, msigcol = msigcol, 
                                            resdir = resdir,width = 18, height = 11) 
{
    clonsigp <- clonsig/apply(clonsig,1,sum)
    subsigp <- subsig/apply(subsig,1,sum)
    labs <- c("clonal","subclonal")
    pdf(file.path(resdir,"Clonal_vs_sublclonal_signature_proportions.pdf"),width=width,height=height)
    bp <- plot(-10,xlim=c(0,ncol(clonsigp)*2+12),ylim=c(-0.1,1.2),axes=F,bty="n",xlab="",ylab="Proportion of mutations",main="Clonal vs.Subclonal Signature Exposures")
    for(j in 1:ncol(clonsigp)){
        sig <- colnames(clonsigp)[j]
        ind <- which(clonsigp[,sig] > 0 | subsigp[,sig] > 0)
        tmp <- data.frame(clon=clonsigp[ind,sig],sub <- subsigp[ind,sig])
        for(i in 1:nrow(tmp)){
            points(((j-1)*2.5):((j-1)*2.5+1),tmp[i,],type="b",pch=c(19),col=msigcol[sig])
            axis(1,at=((j-1)*2.5):((j-1)*2.5+1),labels=labs,pos=0,las=2,cex.axis=1.5,tick=FALSE,cex.axis=1)
        }
        text(mean(((j-1)*2.5):((j-1)*2.5+1)),1.2,sub("Signature.","",sig),col=msigcol[sig])
    }
    axis(2,at=c(0,0.5,1),las=2)
    abline(h=0)
    dev.off()
    avclon <- apply(clonsigp,2,mean,na.rm = T);avclon
    avsub <- apply(subsigp,2,mean,na.rm = T);avsub
    pdf(file.path(resdir,"Clonal_vs_sublclonal_signature_series.pdf"),width=width,height=height)
    par(mfrow=c(1,2),xpd=NA)
    pie(avclon,col=msigcol[names(avclon)],border = msigcol[names(avclon)],labels=sub("Signature.","",names(avclon)),main = "CLONAL")
    pie(avsub,col=msigcol[names(avsub)],border = msigcol[names(avsub)],labels=sub("Signature.","",names(avsub)),main = "SUBCLONAL")
    legend(-2.25,-1.15,legend = names(msigcol),ncol = 5,fill=msigcol,border = msigcol,bty = "n",cex=1)
    dev.off()
}


for (i in 1:24) {
  ##Get only signatures detected for tumor[i]
SBS_cosmic_bytumor_clonal<-SBS_cosmic[rownames(SBS_cosmic)%in% colnames(SBS_signatures_exp_pancancer_NO_AICDA[[i]]$sig_nums),]##l

##Subset maf by tumor type 
vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp<-vcf_sigOrigins_allmuts2_CCF2_clonal2[vcf_sigOrigins_allmuts2_CCF2_clonal2$Tumor_type %in% Tumor_type[i],]
#and only NON AICDA mutations
vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp2<-vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp[vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp$SBS.Sig.max %in% setdiff(unique(vcf_sigOrigins_allmuts2_CCF2_clonal2$SBS.Sig.max),"AICDA_canonical"),]


 #Subset by tumor type
propMutsByCat.clonal <- palimpsest_input(vcf = vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp2,
                                        Type="SBS")

sig_cols_plot_semi#Asign colors, already saved previously

signatures_exp_clonal[[i]] <- deconvolution_fit(input_vcf = vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp2,
                                           input_matrices = propMutsByCat.clonal,
                                           threshold = 6,
                                           input_signatures =SBS_cosmic_bytumor_clonal ,
                                           signature_colours = sig_cols_plot_semi,
                                           doplot = F,
                                           resdir = "/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/Mutational_signatures/Semi_deconvolutionApproach/Clonality")


#####now add a column for the AICDA_canonical using the previously obtain numbers, add at $sig_nums
#Calcule number of AICDA clonal mutation per sample
##Subset to AICDA mutations
AICDA_number_tmp<-data.frame(Sample=unique(vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp$Sample), AICDA_canonical=0)

vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp3<-vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp[vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp$SBS.Sig.max %in% "AICDA_canonical",]


##Rewrite numbers
tmp<-table(as.character(vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp3$Sample))
AICDA_number_tmp$AICDA_canonical[match(names(tmp),AICDA_number_tmp$Sample)]<-tmp[]


signatures_exp_clonal[[i]]$sig_nums$AICDA_canonical<-0

#Subset to same samples
AICDA_number_tmp<-AICDA_number_tmp[AICDA_number_tmp$Sample %in% rownames(signatures_exp_clonal[[i]]$sig_nums),]
  
signatures_exp_clonal[[i]]$sig_nums$AICDA_canonical[match(AICDA_number_tmp$Sample,rownames(signatures_exp_clonal[[i]]$sig_nums))]<-AICDA_number_tmp$AICDA_canonical[AICDA_number_tmp$Sample %in% rownames(signatures_exp_clonal[[i]]$sig_nums)]

###Now recalculate each sig_props using the new mutations numbers
signatures_exp_clonal[[i]]$sig_props$AICDA_canonical<-0
signatures_exp_clonal[[i]]$sig_props<-round(signatures_exp_clonal[[i]]$sig_nums[,]/rowSums(signatures_exp_clonal[[i]]$sig_nums[,]),7)

####
##
#
###Now the subclonal

vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp<-vcf_sigOrigins_allmuts2_CCF2_subclonal2[vcf_sigOrigins_allmuts2_CCF2_subclonal2$Tumor_type %in% Tumor_type[i],]
#and only NON AICDA mutations
vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp2<-vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp[vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp$SBS.Sig.max %in% setdiff(unique(vcf_sigOrigins_allmuts2_CCF2_clonal2$SBS.Sig.max),"AICDA_canonical"),]


 #Subset by tumor type
propMutsByCat.subclonal <- palimpsest_input(vcf = vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp2,
                                        Type="SBS")

sig_cols_plot_semi#Asign colors, already saved previously

signatures_exp_subclonal[[i]] <- deconvolution_fit(input_vcf = vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp2,
                                           input_matrices = propMutsByCat.subclonal,
                                           threshold = 6,
                                           input_signatures =SBS_cosmic_bytumor_clonal ,
                                           signature_colours = sig_cols_plot_semi,
                                           doplot = F,
                                           resdir = "/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/Mutational_signatures/Semi_deconvolutionApproach/Clonality")


#####now add a column for the AICDA_canonical using the previously obtain numbers, add at $sig_nums
#Calcule number of AICDA clonal mutation per sample
##Subset to AICDA mutations
AICDA_number_tmp<-data.frame(Sample=unique(vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp$Sample), AICDA_canonical=0)

vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp3<-vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp[vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp$SBS.Sig.max %in% "AICDA_canonical",]


##Rewrite numbers
tmp<-table(as.character(vcf_sigOrigins_allmuts2_CCF2_clonal2_tmp3$Sample))
AICDA_number_tmp$AICDA_canonical[match(names(tmp),AICDA_number_tmp$Sample)]<-tmp[]


signatures_exp_subclonal[[i]]$sig_nums$AICDA_canonical<-0

#Subset to same samples
AICDA_number_tmp<-AICDA_number_tmp[AICDA_number_tmp$Sample %in% rownames(signatures_exp_subclonal[[i]]$sig_nums),]


  
signatures_exp_subclonal[[i]]$sig_nums$AICDA_canonical[match(AICDA_number_tmp$Sample,rownames(signatures_exp_subclonal[[i]]$sig_nums))]<-AICDA_number_tmp$AICDA_canonical[AICDA_number_tmp$Sample %in% rownames(signatures_exp_subclonal[[i]]$sig_nums)]




###Now recalculate each sig_props using the new mutations numbers
signatures_exp_subclonal[[i]]$sig_props$AICDA_canonical<-0
signatures_exp_subclonal[[i]]$sig_props<-round(signatures_exp_subclonal[[i]]$sig_nums[,]/rowSums(signatures_exp_subclonal[[i]]$sig_nums[,]),7)

##Subset to same samples as clonal
signatures_exp_subclonal[[i]]$sig_nums<-signatures_exp_subclonal[[i]]$sig_nums[rownames(signatures_exp_subclonal[[i]]$sig_nums)%in% rownames(signatures_exp_clonal[[i]]$sig_nums),]
signatures_exp_subclonal[[i]]$sig_props<-signatures_exp_subclonal[[i]]$sig_props[rownames(signatures_exp_subclonal[[i]]$sig_props)%in% rownames(signatures_exp_clonal[[i]]$sig_props),]


###
# Generate across the series comparisons of signature assigned to clonal and subclonal mutations

#Create directory to save files in case it does not exist
resdir.=paste0("/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/Mutational_signatures/Semi_deconvolutionApproach/Clonality/",Tumor_type[i]);if(!file.exists(resdir.)) dir.create(resdir.) 


##Aplify graph characteristics

palimpsest_clonalitySigsCompare2(clonsig = signatures_exp_clonal[[i]]$sig_nums, subsig = signatures_exp_subclonal[[i]]$sig_nums, msigcol = sig_cols_plot_semi, resdir =resdir.)
     

}

##Add clonality data to the maf file

maf_panCancer_global<-cbind(maf_panCancer_global,matrix(data = NA,nrow =nrow(maf_panCancer_global) ,ncol = ncol(vcf_sigOrigins_allmuts2_CCF2[,63:76]),dimnames = list(NULL, colnames(vcf_sigOrigins_allmuts2_CCF2[,63:76])) )) #To get the colnames but filling with NA

maf_panCancer_global[match(vcf_sigOrigins_allmuts2_CCF2$Mut_identifier,paste0(maf_panCancer_global$Tumor_Sample_Barcode,"-",maf_panCancer_global$Start_Position,"-",maf_panCancer_global$t_alt_count)),163:176]<-vcf_sigOrigins_allmuts2_CCF2[,63:76]



####################
##################Adding Neoepitope data
###



####################
##################Merging
###
###Add clonality
maf_panCancer_global_neoepitope$Clonality<-maf_panCancer_global$Clonality[match(paste0(maf_panCancer_global_neoepitope$Hugo_Symbol,maf_panCancer_global_neoepitope$Tumor_Sample_Barcode),paste0(maf_panCancer_global$Hugo_Symbol,maf_panCancer_global$Tumor_Sample_Barcode))]
##FIlter to mutations/samples having signature information..
maf_panCancer_global_neoepitope2<-maf_panCancer_global_neoepitope[!is.na(maf_panCancer_global_neoepitope$Tumor_Sample_Barcode),c(1,5:13,16,37,40:46,73,93,112,116,118,119,160,164:177)]
maf_panCancer_global_neoepitope2<-maf_panCancer_global_neoepitope2[!is.na(maf_panCancer_global_neoepitope2$Clonality),]##Clonality avaliable

###Add expression counts
colnames(RNA_counts_pan)<-design_deseq2_2$bcr_patient_barcode[match(colnames(RNA_counts_pan),design_deseq2_2$sample)]#Change names to match the maf file
###SUbset to easy coupling...
RNA_counts_pan2<-RNA_counts_pan[rownames(RNA_counts_pan)%in%maf_panCancer_global_neoepitope2$Hugo_Symbol,colnames(RNA_counts_pan)%in%maf_panCancer_global_neoepitope2$Tumor_Sample_Barcode]
maf_panCancer_global_neoepitope2$RNA_counts<-0

#maf_panCancer_global_neoepitope2$RNA_counts<-RNA_counts_pan2[match(maf_panCancer_global_neoepitope2$Hugo_Symbol,rownames(RNA_counts_pan2)),match(maf_panCancer_global_neoepitope2$Tumor_Sample_Barcode,colnames(RNA_counts_pan2))]


###Filter mutations from not expressed genes FPKM >=1 only even though Rooney et al used TMP >10

length(unique(maf_panCancer_global_neoepitope2$Tumor_Sample_Barcode))
#3370




###Add HLA specific haplotypes derived from..https://cancerimmunolres.aacrjournals.org/content/8/3/396
HLA_data_ShaoEtal <- read_excel("HLA_data_ShaoEtal.xlsx", sheet = "Table S9b", skip = 2)
HLA_data_ShaoEtal<-as.data.frame(HLA_data_ShaoEtal)
length(unique(HLA_data_ShaoEtal$ParticipantBarcode))
##COnsider there are samples with the possibility of multiple HLA haplotypes
maf_panCancer_global_neoepitope2$HLA_haplotype<-NA

#for (i in 1:nrow(maf_panCancer_global_neoepitope2)) {
  
  maf_panCancer_global_neoepitope2$HLA_haplotype[i]<-paste0(unique(HLA_data_ShaoEtal$Allele[HLA_data_ShaoEtal$ParticipantBarcode%in%maf_panCancer_global_neoepitope2$Tumor_Sample_Barcode[i]]),collapse = ",")
}
length(unique(maf_panCancer_global_neoepitope2$Tumor_Sample_Barcode[!is.na(maf_panCancer_global_neoepitope2$HLA_haplotype)]))
#Only 2272 samples
 tmp_func<-function(x,y){
  paste0(unique(y$Allele[y$ParticipantBarcode%in%x$Tumor_Sample_Barcode]),collapse = ",")
 }

tmp<-apply(maf_panCancer_global_neoepitope2,1,FUN =function(x2) tmp_func(x2, y=HLA_data_ShaoEtal))

###Export table with peptide list and calculate Prime values
maf_panCancer_global_neoepitope2$HLA_haplotype[maf_panCancer_global_neoepitope2$HLA_haplotype==""]<-NA #Replace
peptide_list<-maf_panCancer_global_neoepitope2$peptide_mut[!is.na(maf_panCancer_global_neoepitope2$HLA_haplotype)]

peptide_list<-peptide_list[-grep("X",peptide_list)]#Take out peptides containing "X" aminoacids
peptide_list<-peptide_list[-grep("0",peptide_list)]
write.table(quote = F,sep = "\t",row.names = F,col.names = F,x = peptide_list,file="/media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_mutPeptideSeq_list.txt")
##HLA names
HLA_list<-unique((maf_panCancer_global_neoepitope2$Allele))
HLA_list<-HLA_list[!is.na(HLA_list)]
HLA_list<-gsub("HLA-","",HLA_list)
HLA_list<-paste(as.character(HLA_list), collapse=",")
write.table(quote = F,sep = "\t",row.names = F,col.names = F,x = HLA_list,file="/media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_HLA_list.tab")

###
length(maf_panCancer_global_neoepitope2$peptide_mut[!is.na(maf_panCancer_global_neoepitope2$HLA_haplotype)])

##Run PRIME in bash
####################
##################Run PRIME in bash
###
cd 
./PRIME -i /media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_mutPeptideSeq_list.txt -o /media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_mutPeptideSeq_list_out.txt -a B15:01,B07:02,C16:01,C07:02,A30:01,A11:01,B39:06,C01:02,A01:01,B27:05,A30:02,A68:01,B27:02,A32:01,A33:01,B40:01,B08:01,A23:01,B52:01,C12:03,B49:01,C02:02,C05:01,A02:02,B39:01,B15:04,C12:02,B15:02,C03:03,B15:03,C08:02,C06:02,B50:01,A02:06,A02:05,B18:01,A24:03,B58:01,B13:02,B15:07,B35:02,B35:08,C15:02,B40:02,B42:01,B15:18,C07:01,B07:05,B44:06,B14:01,C03:04,B45:01,B15:17,B41:01,B53:01,B38:01,C14:02,C03:02,B37:01,B56:01,B35:17,C17:01,C08:04,B15:10,C06:17,A01:02,B18:05,B41:02,B15:39,B51:05,B15:13,B15:35,B35:05,B55:01,B82:01,B15:12,C15:05,B55:02,B15:25,B35:12,B15:08,A02:11,B42:02,B18:07,B57:03,B57:02,B47:01,B44:08,A69:01,B07:10,B44:05,B81:01,B39:10,A66:01,C08:03,B48:01,B51:08,B15:27,B13:01,B39:24,B27:04,B58:02,B51:02,B39:11,B15:11 -mix /home/user/MixMHCpred-master/MixMHCpred
######
####Melt data to some general graphs...
maf_panCancer_global_neoepitope2$Ag_count<-1

ag_melt<-maf_panCancer_global_neoepitope2[] %>% group_by(Tumor_Sample_Barcode,Tumor_type,Clonality,SBS.Sig.max) %>% dplyr::summarise(Ag_sum = sum(Ag_count))
ag_melt<-as.data.frame(ag_melt)
head(ag_melt)
dim(ag_melt)

#####Graph
Labels_tmp<-table(maf_panCancer_global_neoepitope2$Tumor_type[!duplicated(maf_panCancer_global_neoepitope2$Tumor_Sample_Barcode)])
bp <- ggplot(ag_melt[ag_melt$SBS.Sig.max%in%c("AICDA_canonical","SBS2","SBS13"),], aes(y=Ag_sum,x=Tumor_type)) +geom_boxplot(aes_string(col = "Clonality"), alpha = 0.5)+theme_std() + facet_wrap(~SBS.Sig.max,ncol = 1) + ylab("Neoepitope count")+ xlab("Tumor type")+ theme(axis.text.x = element_text(angle = 90,size=8),axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",mapping = aes(x=Tumor_type,y=Ag_sum,group=Clonality))+scale_x_discrete(labels=paste0(names(Labels_tmp)," (n=", Labels_tmp,")"))+ylim(c(0,100))
  
  
ggsave(bp,filename = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_neoAgs_clonality_bySignature.pdf",width = 14,height = 12)
##This is  not inlcuded in the manuscript..

#Other comparison
bp2<-ggplot(ag_melt[ag_melt$SBS.Sig.max%in%c("AICDA_canonical","SBS2","SBS13"),], aes(y=Ag_sum,x=Tumor_type)) +geom_boxplot(aes_string(col = "SBS.Sig.max"), alpha = 0.5)+theme_std() + facet_wrap(~Clonality,ncol = 1) + ylab("Neoepitope count")+ xlab("Tumor type")+ theme(axis.text.x = element_text(angle = 90,size=8),axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",mapping = aes(x=Tumor_type,y=Ag_sum,group=SBS.Sig.max))+scale_x_discrete(labels=paste0(names(Labels_tmp)," (n=", Labels_tmp,")"))+ylim(c(0,100))

ggsave(bp2,filename = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_neoAgs_Signatures_byClonality.pdf",width = 14,height = 12)
#

##################
##############
###Import PRIME results...

TCGA_PRIME_out <- read.delim("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_mutPeptideSeq_list_out.txt", comment.char="#")


##SUbset data to only ones containing full haplotypes
###Export table with peptide list and calculate Prime values
maf_panCancer_global_neoepitope2_prime<-maf_panCancer_global_neoepitope2[!is.na(maf_panCancer_global_neoepitope2$HLA_haplotype),]

maf_panCancer_global_neoepitope2_prime<-maf_panCancer_global_neoepitope2_prime[-grep("X",maf_panCancer_global_neoepitope2_prime$peptide_mut),]#Take out peptides containing "X" aminoacids
maf_panCancer_global_neoepitope2_prime<-maf_panCancer_global_neoepitope2_prime[-grep("0",maf_panCancer_global_neoepitope2_prime$peptide_mut),]

##Now is the same length as the PRIME output and in the same order
all(TCGA_PRIME_out$Peptide==maf_panCancer_global_neoepitope2_prime$peptide_mut) ###Sanity check, must be TRUE




#######Assign correct PRIME rank and score to each peptide depending on the patients real HLA
prime_cols<-gsub(pattern = "[.]",replacement = ":",gsub(pattern = "X.Rank_",replacement = "",colnames(TCGA_PRIME_out)[]))
maf_panCancer_global_neoepitope2_prime$Best_allele<-NA
maf_panCancer_global_neoepitope2_prime$Prime_Rank<-100
maf_panCancer_global_neoepitope2_prime$Prime_score<-NA
length(unique(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode[!is.na(maf_panCancer_global_neoepitope2_prime$HLA_haplotype)]))

for (i in 1:nrow(maf_panCancer_global_neoepitope2_prime)) {
  tmp<-strsplit(maf_panCancer_global_neoepitope2_prime$HLA_haplotype[i],",")
  tmp<-tmp[[1]] #Extract HLA haplotypes
  tmp<-gsub(pattern = "HLA-","",tmp) #Change to match prime columnames
  rank_values<- TCGA_PRIME_out[i,c(which(prime_cols%in%tmp),1)] ##Extract values, ##Add "Peptide" column In case there was only one Haplotype
  
  rank_values<-rank_values[order(rank_values)] #Put them in ascending order (lower values first)
  rank_values<-rank_values[1]
  rank_values<-ifelse(!is.null(colnames(rank_values)),gsub(pattern = "[.]",replacement = ":",gsub(pattern = "X.Rank_",replacement = "",colnames(rank_values)[])),"Not_avaliable") #Extract Haplotype
 maf_panCancer_global_neoepitope2_prime$Best_allele[i]<-rank_values
maf_panCancer_global_neoepitope2_prime$Prime_Rank[i]<-ifelse(rank_values!="Not_avaliable",TCGA_PRIME_out[i,which(prime_cols%in%rank_values)],"Not_avaliable")#Extract prime_rank of corresponding matching allele
maf_panCancer_global_neoepitope2_prime$Prime_score[i]<-ifelse(rank_values!="Not_avaliable",TCGA_PRIME_out[i,(which(prime_cols%in%rank_values)+1)],"Not_avaliable")#Extract prime_score of corresponding matching allele
  
}


# Patients with atleast one peptide encompassing the mutated residue showing a PRIME %rank score lower or equal to 0.5% with at least one of the patient’s HLA-I alleles were assigned to the group where the mutation would be immunogenic (P+ in Figure 6A). Patients with all peptides showing a PRIME %rank larger or equal to 2% with all HLA-I alleles of the patient were assigned to the group where the mutation would not be immunogenic (P). 
maf_panCancer_global_neoepitope2_prime$Prime_Rank<-as.numeric(maf_panCancer_global_neoepitope2_prime$Prime_Rank)
maf_panCancer_global_neoepitope2_prime$Prime_score<-as.numeric(maf_panCancer_global_neoepitope2_prime$Prime_score)

######Eliminate samples that are MSI...
library(BiocOncoTK)
BiocOncoTK::dingMSI
dim(BiocOncoTK::dingMSI)
dim(MSIsensor.10k)
head(MSIsensor.10k)
table(BiocOncoTK::dingMSI$MSIsensor.score)
table(BiocOncoTK::dingMSI$MSIsensor.score>4)
table(BiocOncoTK::dingMSI$MSIsensor.score>=4)
maf_panCancer_global_neoepitope2_prime$Ding_MSI_score<-dingMSI$MSIsensor.score[match(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode,dingMSI$participant_barcode)]
table(maf_panCancer_global_neoepitope2_prime$Ding_MSI_score>=4)
head(maf_panCancer_global_neoepitope2_prime$Ding_MSI_score)
head(dingMSI[match(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode,dingMSI$participant_barcode),])
maf_panCancer_global_neoepitope2_prime$Ding_MSI_status<-ifelse(maf_panCancer_global_neoepitope2_prime$Ding_MSI_score<4,"MSI-L","MSI-H")

table(maf_panCancer_global_neoepitope2_prime$Ding_MSI_status)

length(unique(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode[!is.na(maf_panCancer_global_neoepitope2_prime$HLA_haplotype)&maf_panCancer_global_neoepitope2_prime$Ding_MSI_status=="MSI-L"]))
[1] 2224
length(unique(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode[!is.na(maf_panCancer_global_neoepitope2_prime$HLA_haplotype)]))
[1] 2272

#THere are 48 MSI samples
maf_panCancer_global_neoepitope2_prime_all<-maf_panCancer_global_neoepitope2_prime
maf_panCancer_global_neoepitope2_prime<-maf_panCancer_global_neoepitope2_prime_all[maf_panCancer_global_neoepitope2_prime_all$Ding_MSI_status=="MSI-L",]

#####
##
###Filter out samples with altered antigen-presentation
length(unique(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode))
#2224 patients
###Get samples
damage_tmp<-names(table(maf_panCancer_global$PolyPhen))[449:1002]
damage_tmp2<-names(table(maf_panCancer_global$SIFT))[2:12]
disrupt_samples<-unique(maf_panCancer_global$Tumor_Sample_Barcode[maf_panCancer_global$Hugo_Symbol%in%c("HLA-A","HLA-B","HLA-C","CIITA","IRF1","PSME1","PSME2","PSME3","ERAP1","ERAP2","HSPA","HSPC","TAP1","TAP2","TAPBP","CALR","CNX","PDIA3","B2M")&(maf_panCancer_global$PolyPhen%in%damage_tmp|maf_panCancer_global$PolyPhen%in%damage_tmp2|maf_panCancer_global$ntot<2)])
length(disrupt_samples)
#[1] 444

###SUbset clinical data to these samples (containing survival info)
TCGA_clinical2_neopeptides<-TCGA_clinical2[TCGA_clinical2$bcr_patient_barcode%in%unique(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode),] #THe ones with prime info...
TCGA_clinical2_neopeptides<-TCGA_clinical2_neopeptides[TCGA_clinical2_neopeptides$bcr_patient_barcode%nin%disrupt_samples,] #take out disrupted samples
dim(TCGA_clinical2_neopeptides)
#[1] 2146   61
table(TCGA_clinical2_neopeptides$type)
#There are only 3 COAD samples, too low for survival analysis, take out
TCGA_clinical2_neopeptides<-TCGA_clinical2_neopeptides[TCGA_clinical2_neopeptides$type%nin%"COAD",]

###Subset maf_panCancer_global_neoepitope2_prime to these samples

maf_panCancer_global_neoepitope2_prime<-maf_panCancer_global_neoepitope2_prime[maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode%in%TCGA_clinical2_neopeptides$bcr_patient_barcode,]

#############


##Porcentage of neopitopes potentially immunogenic
length(maf_panCancer_global_neoepitope2_prime$Prime_Rank[maf_panCancer_global_neoepitope2_prime$Prime_Rank<=0.5])/nrow(maf_panCancer_global_neoepitope2_prime)

######PLOT....

Labels_tmp<-table(maf_panCancer_global_neoepitope2_prime$Tumor_type[!duplicated(maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode)])



ggplot(maf_panCancer_global_neoepitope2_prime[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max%in%c("AICDA_canonical","SBS2","SBS13"),], aes(y=-log10(Prime_Rank),x=Tumor_type)) +geom_boxplot(aes_string(col = "Clonality"), alpha = 0.5)+theme_std() + facet_wrap(~SBS.Sig.max,ncol = 1) + ylab("-Log10(Prime Rank)")+ xlab("Tumor type")+ theme(axis.text.x = element_text(angle = 90,size=8),axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",mapping = aes(x=Tumor_type,y=Prime_Rank,group=Clonality))+scale_x_discrete(labels=paste0(names(Labels_tmp)," (n=", Labels_tmp,")"))+ylim(c(-2,2))
  



compare_means(Prime_Rank ~ Clonality, data = maf_panCancer_global_neoepitope2_prime[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max%in%c("AICDA_canonical","SBS2","SBS13"),],group.by = "SBS.Sig.max")
###Significant only for AICDA
maf_panCancer_global_neoepitope2_prime$Immunogenic_Prime<-ifelse(maf_panCancer_global_neoepitope2_prime$Prime_Rank<=0.5,"Immunogenic","Non-Immunogenic")

compare_means(Prime_Rank ~ SBS.Sig.max_reduced, data = maf_panCancer_global_neoepitope2_prime[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("AICDA_canonical","SBS2","SBS13")&maf_panCancer_global_neoepitope2_prime$Clonality=="clonal"&maf_panCancer_global_neoepitope2_prime$Immunogenic_Prime=="Immunogenic",])
#No difference


###Difficult to visualize, plot number of Immunogenic peptides instead of RANK

####Melt data to some general graphs...
maf_panCancer_global_neoepitope2_prime$Ag_count<-1

maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced<-maf_panCancer_global_neoepitope2_prime$SBS.Sig.max ##Reduce signatures to plot
#MMR signatures 6, 15, 20, 21, and 26; Smoking-associated signatures 4, 18, 24, and 29; POLE (signatures 10 or 14), TMZ (signature 11)
maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("SBS6","SBS15","SBS20","SBS21","SBS26","SBS44")]<-"MMR"
maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("SBS4","SBS18","SBS24","SBS29")]<-"Smoking-associated"
maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("SBS10a","SBS10b","SBS14")]<-"POLE"
maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("SBS11")]<-"TMZ"
maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("SBS7a","SBS7b")]<-"UV"
maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("SBS31","SBS35")]<-"Platinium treatment"

maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced[maf_panCancer_global_neoepitope2_prime$SBS.Sig.max_reduced%in%c("SBS8","SBS16","SBS37","SBS28","SBS12")]<-"Other"


ag_melt<-maf_panCancer_global_neoepitope2_prime[] %>% group_by(Tumor_Sample_Barcode,Tumor_type,Clonality,SBS.Sig.max_reduced,Immunogenic_Prime) %>% dplyr::summarise(Ag_sum = sum(Ag_count))
ag_melt<-as.data.frame(ag_melt)
head(ag_melt)
dim(ag_melt)

tmp<-compare_means(Ag_sum ~ Clonality, data = ag_melt[ag_melt$Immunogenic_Prime%in%"Immunogenic",],group.by = "SBS.Sig.max_reduced") #Extract
tmp<-as.data.frame(tmp)
sig_signatures<-tmp$SBS.Sig.max_reduced[tmp$p<0.05]
bp3<-ggplot(ag_melt[!is.na(ag_melt$SBS.Sig.max_reduced)&ag_melt$Immunogenic_Prime%in%"Immunogenic"&ag_melt$SBS.Sig.max_reduced%in%c(sig_signatures),], aes(y=Ag_sum,x=Tumor_type)) +geom_boxplot(aes_string(col = "Clonality"), alpha = 0.5)+theme_std() + facet_wrap(~SBS.Sig.max_reduced,ncol = 2) + ylab("Number of Immunogenic neopeptides")+ xlab("Tumor type")+ theme(axis.text.x = element_text(angle = 90,size=8),axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",mapping = aes(x=Tumor_type,y=Ag_sum,group=Clonality))+scale_x_discrete(labels=paste0(names(Labels_tmp)," (n=", Labels_tmp,")"))+ylim(c(0,150))
  
ggsave(bp3,filename = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_neoAgs_Signatures_byClonality_immunogenic_SUP3.pdf",width = 16,height = 14)
#This is  Supplementary Figure 22A

##In general there are more clonal immunogenic peptides independently of the signature but it is significantly different according to the tumor type
my_comparisons <- list( c("AICDA_canonical", "SBS13"), c("AICDA_canonical", "Smoking-associated"), c("AICDA_canonical", "SBS3"),c("AICDA_canonical", "UV") )
bp3<-ggplot(ag_melt[!is.na(ag_melt$SBS.Sig.max_reduced)&ag_melt$Immunogenic_Prime%in%"Immunogenic"&ag_melt$SBS.Sig.max_reduced%in%c("AICDA_canonical","SBS2","SBS13","SBS3","Smoking-associated","UV")&ag_melt$Clonality=="clonal",], aes(y=Ag_sum,x=SBS.Sig.max_reduced)) +geom_boxplot(aes_string(col = "SBS.Sig.max_reduced"), alpha = 0.5)+theme_std() + facet_wrap(~Tumor_type,ncol = 2) + ylab("Number of Immunogenic neopeptides")+ xlab("Signature")+ theme(axis.text.x = element_text(angle = 90,size=8),axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",comparisons = my_comparisons)
  
ggsave(bp3,filename = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/TCGA_neoAgs__immunogenic_byTumor_signaturesCOmparison_SUP4.pdf",width = 20,height = 18)

#This is  Supplementary Figure 22B



#############
############Obtain data for Figure 6A
###
s_tab<-compare_means(Ag_sum ~ SBS.Sig.max_reduced, data = ag_melt[!is.na(ag_melt$SBS.Sig.max_reduced)&ag_melt$Immunogenic_Prime%in%"Immunogenic"&ag_melt$SBS.Sig.max_reduced%in%c("AICDA_canonical","SBS2","SBS13","SBS3","Smoking-associated","UV")&ag_melt$Clonality=="clonal",],group.by = "Tumor_type")


#####
###
###Plot... using all antigens across all tumors, barplot the number of immunogenic vs non immunogenic in clonal and subclonal mutations
res <- as.data.frame.matrix(xtabs(~Clonality +Immunogenic_Prime , data=ag_melt[])) ##Number per immunogenic

role_order <- c('subclonal','clonal')
pval <- fisher.test(res)$p.value
pval_label <-paste0('p ~',prettyNum(pval,digits=1))
res$Role<-rownames(res)
res$Role <- factor(res$Role, levels=role_order)
res <- res[order(res$Role),]
res2 <- as.data.table(res)

res2[,Role:=NULL]
res2 <- res2 / rowSums(res2)
res2 <- cbind(Role=role_order,res2)
dat2 <- melt(res2,id.var='Role')
dat2$Role <- factor(dat2$Role,levels=role_order)
dat2$value <- 100*dat2$value
ci <- binom.confint(res$Immunogenic,(res$`Non-Immunogenic` + res$Immunogenic),method='exact')
ci$Role <- res2$Role
ci$Role <- factor(ci$Role, levels=role_order)
setnames(ci,'mean','value')
ci$value <- 100*ci$value
ci$lower <- 100*ci$lower
ci$upper <- 100*ci$upper
ci$Role <- factor(ci$Role, levels=c('clonal','subclonal'))
cols <- c('#2D92BA','#B7DAEA')
names(cols) <- colnames(res)[1:2]
dat2$variable<- factor(dat2$variable,levels=rev(colnames(res)[1:2]))

####
#########
##############
####Cumulative curves...to see the distribution of immunogenic neoantigens regarding the signature using all tumors

d_mutations<-maf_panCancer_global_neoepitope2_prime[,c('Tumor_Sample_Barcode','Hugo_Symbol',            'Variant_Classification','Variant_Type','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2',
                                   'Start_Position','End_Position',"AICDA_mutation","Clonality","SBS.Sig.max_reduced","peptide_mut","Immunogenic_Prime")]


d_mutations<-as.data.table(d_mutations)#Transform to data.table

### load MAF, subset for hotspot SNPs among valid samples
d <- d_mutations  #This already includes the AICDA tag

dd <- d[Variant_Classification %in% 'Missense_Mutation',]

### annotate if the sample-gene is singleton or composite
dd$id <- paste0(dd$Tumor_Sample_Barcode,dd$Hugo_Symbol)
dd$pid_peptide <- paste0(dd$id,dd$peptide_mut)
total_number_neopeptides<- length(unique(dd$pid_peptide))
#####Get the total number of pepties for each category that wants to be plotted...
##
#Non-immunogenic (both clonal and subclonal)
total_number_noImmunogenic_neopeptides <- length(unique(dd$pid_peptide[dd$Immunogenic_Prime=="Non-Immunogenic"]))
#SUbclonal immunogenic
total_number_Subclonal_immunogenic_neopeptides <- length(unique(dd$pid_peptide[dd$Immunogenic_Prime=="Immunogenic"&dd$Clonality=="subclonal"]))
#Clonal immunogenic AICDA
total_number_clonal_immuno_AID <- length(unique(dd$pid_peptide[dd$Immunogenic_Prime=="Immunogenic"&dd$Clonality=="clonal"&dd$SBS.Sig.max_reduced=="AICDA_canonical"]))#
#Clonal immunogenic APOBEC (SBS2, SBS13)
total_number_clonal_immuno_APOBEC <- length(unique(dd$pid_peptide[dd$Immunogenic_Prime=="Immunogenic"&dd$Clonality=="clonal"&dd$SBS.Sig.max_reduced%in%c("SBS2","SBS13")]))###
##Clonal immunogenic other
total_number_clonal_immuno_Other <- length(unique(dd$pid_peptide[dd$Immunogenic_Prime=="Immunogenic"&dd$Clonality=="clonal"&dd$SBS.Sig.max_reduced%nin%c("SBS2","SBS13","AICDA_canonical")]))
#Total clonal iimunogenic
total_number_peptides<-length(unique(dd$pid_peptide))


### for each hotspot, count the number patients with  neopeptides ocurring at that "position" due to each category

summarize_individual_hotspot3 <- function(d) {
  ### count affected singleton/composite samples
  neopeptide_samples <- sortunique(d$pid_peptide)
  #Non-immunogenic (both clonal and subclonal)
noImmunogenic_samples <- sortunique(d$pid_peptide[d$Immunogenic_Prime=="Non-Immunogenic"])
#SUbclonal immunogenic
Subclonal_immunogenic_samples <- sortunique(d$pid_peptide[d$Immunogenic_Prime=="Immunogenic"&d$Clonality=="subclonal"])
#Clonal immunogenic AICDA
Clonal_immuno_AID_samples <- sortunique(d$pid_peptide[d$Immunogenic_Prime=="Immunogenic"&d$Clonality=="clonal"&d$SBS.Sig.max_reduced=="AICDA_canonical"])
#Clonal immunogenic APOBEC (SBS2, SBS13)
Clonal_immuno_APOBEC_samples <- sortunique(d$pid_peptide[d$Immunogenic_Prime=="Immunogenic"&d$Clonality=="clonal"&d$SBS.Sig.max_reduced%in%c("SBS2","SBS13")])
##Clonal immunogenic other
Clonal_immuno_Other_samples <- sortunique(d$pid_peptide[d$Immunogenic_Prime=="Immunogenic"&d$Clonality=="clonal"&d$SBS.Sig.max_reduced%nin%c("SBS2","SBS13","AICDA_canonical")])
  
  ### don't overcount patients with 2+ samples
  #mutant_samples <- unique(strtrim(mutant_samples,9))
  n.neopeptides <- length(neopeptide_samples)
  n.noImmunogenic <- length(noImmunogenic_samples)
  n.subclonalImmunogenic <- length(Subclonal_immunogenic_samples)
  n.ClonalAIDImmuno <- length(Clonal_immuno_AID_samples)
  n.ClonalAPOBECImmuno <- length(Clonal_immuno_APOBEC_samples)
  n.ClonalOtherCImmuno <- length(Clonal_immuno_Other_samples)
  list(n.neopeptides=n.neopeptides,n.noImmunogenic=n.noImmunogenic,n.subclonalImmunogenic=n.subclonalImmunogenic,n.ClonalAIDImmuno=n.ClonalAIDImmuno,n.ClonalAPOBECImmuno=n.ClonalAPOBECImmuno,n.ClonalOtherCImmuno=n.ClonalOtherCImmuno)
} #Redefine function for AICDA calculation...

###This is for AICDA hotspots and global at the same time!!! #HGVSp_Short
dd$hotspotid<-paste0(dd$Hugo_Symbol," ",gsub('.{1}$',"",gsub(pattern = "p.",replacement = "",x = dd$HGVSp_Short))) #Hotspot without aminoacid change, just the position (this is because the AA change is mostly related to the signature)
hotspots <- sortunique(dd$hotspotid)
hotspots <- hotspots[hotspots!='']

ll <- dd[hotspotid!='',summarize_individual_hotspot3(.SD),by=hotspotid]
##order
ll <- ll[order(n.neopeptides,decreasing=T),]

###This is the order according to the most frequent neopeptide originating from a specific residue 

###Get the most frequent mutated hotspots using all mutations (not only the ones producing neopeptides) from the orginal maf file but subsetting to include the same samples (n = 2,224)
load("/media/user/seagate_ICM/AICDA_analysis/maf_pancancer_CFF_clonality.RData")
maf_prime_allmuts<-maf_panCancer_global[maf_panCancer_global$Tumor_Sample_Barcode%in%maf_panCancer_global_neoepitope2_prime$Tumor_Sample_Barcode,]

tmp_d<-maf_prime_allmuts[,c('Tumor_Sample_Barcode','Hugo_Symbol',            'Variant_Classification','Variant_Type','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2',
                                   'Start_Position','End_Position',"AICDA_mutation")]
tmp_d<-as.data.table(tmp_d)
summarize_individual_hotspot4 <- function(d) {
  ### count affected singleton/composite samples
  mutant_samples <- sortunique(d$Tumor_Sample_Barcode)
  n.mutant <- length(mutant_samples)
  list(n.mutant=n.mutant)
}
tmp_d$hotspotid<-paste0(tmp_d$Hugo_Symbol," ",gsub('.{1}$',"",gsub(pattern = "p.",replacement = "",x = tmp_d$HGVSp_Short))) #Hotspot without aminoacid change, just the position (this is because the AA change is mostly related to the signature)
tmp_hotspots <- sortunique(tmp_d$hotspotid)
tmp_hotspots <- tmp_hotspots[tmp_hotspots!='']

tmp_ll <- tmp_d[hotspotid!='',summarize_individual_hotspot4(.SD),by=hotspotid]
##order
tmp_ll <- tmp_ll[order(n.mutant,decreasing=T),]
###Order the hotspots neopeptides according to these frequency (top mutated hotspot residues...)
tmp_ll$x.order <- 1:nrow(tmp_ll)
########
#####Now add the data of the neoPeptides
ll_merged<-merge(tmp_ll,ll,by="hotspotid",all.x=TRUE) #Merge filling with NA if no peptides at that residue
ll_merged <- ll_merged[order(x.order,decreasing=F),] #Re-order
ll_merged[is.na(ll_merged)]<-0

total_number_noImmunogenic_neopeptides<-sum(ll_merged$n.noImmunogenic,na.rm = T)
total_number_clonal_immuno_Other <-sum(ll_merged$n.ClonalOtherCImmuno,na.rm = T)

ll_merged$p.noImmunogenic <- cumsum(ll_merged$n.noImmunogenic / total_number_noImmunogenic_neopeptides)
ll_merged$p.subclonalImmunogenic <- cumsum(ll_merged$n.subclonalImmunogenic / total_number_Subclonal_immunogenic_neopeptides)
ll_merged$p.ClonalAIDImmuno <- cumsum(ll_merged$n.ClonalAIDImmuno / total_number_clonal_immuno_AID)
ll_merged$p.ClonalAPOBECImmuno <- cumsum(ll_merged$n.ClonalAPOBECImmuno / total_number_clonal_immuno_APOBEC)
ll_merged$p.ClonalOtherCImmuno <- cumsum(ll_merged$n.ClonalOtherCImmuno / total_number_clonal_immuno_Other)
ll_merged$p.neopeptides <- cumsum(ll_merged$n.neopeptides / total_number_peptides)


setnames(ll_merged,'hotspotid','hotspot')

##Overall probabilities
 max(ll_merged$p.noImmunogenic,na.rm = T)
  
 max(ll_merged$p.subclonalImmunogenic,na.rm = T)
  
#AICDA
 max(ll_merged$p.ClonalAIDImmuno,na.rm = T)
 
 max(ll_merged$p.ClonalAPOBECImmuno,na.rm = T)
 
  max(ll_merged$p.ClonalOtherCImmuno,na.rm = T)


###For inset on the right..
###Plot... using all antigens across all tumors, barplot the number of immunogenic vs non immunogenic in clonal and subclonal mutations
res3 <-rbind(table(TCGA_clinical2_neopeptides$Clonal_immuno_AICDA_binary),table(TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC_binary)) ##Number per immunogenic
rownames(res3)<-c("AICDA","APOBEC")
role_order <- c("AICDA","APOBEC")
res3<-as.data.frame(res3)
pval <- fisher.test(res3)$p.value
pval_label <-paste0('p ~',prettyNum(pval,digits=1))
res3$Role<-rownames(res3)
res3$Role <- factor(res3$Role, levels=role_order)
res3 <- res3[order(res3$Role),]
res2 <- as.data.table(res3)

res2[,Role:=NULL]
res2 <- res2 / rowSums(res2)
res2 <- cbind(Role=role_order,res2)
dat2 <- melt(res2,id.var='Role')
dat2$Role <- factor(dat2$Role,levels=role_order)
dat2$value <- 100*dat2$value
ci <- binom.confint(res3$Presence,(res3$Presence + res3$Absence),method='exact')
ci$Role <- res2$Role
ci$Role <- factor(ci$Role, levels=role_order)
setnames(ci,'mean','value')
ci$value <- 100*ci$value
ci$lower <- 100*ci$lower
ci$upper <- 100*ci$upper
ci$Role <- factor(ci$Role, levels=c('APOBEC','AICDA'))
cols <- c('#2D92BA','#B7DAEA')
names(cols) <- rev(colnames(res3)[1:2])
dat2$variable<- factor(dat2$variable,levels=(colnames(res3)[1:2]))

####
#######Note: Create supplementary table comparing per tumor type..
#Table for the tests
tmp<-cbind(as.data.frame.matrix(xtabs(~type +Clonal_immuno_AICDA_binary , data=TCGA_clinical2_neopeptides[])),
as.data.frame.matrix(xtabs(~type +Clonal_immuno_APOBEC_binary , data=TCGA_clinical2_neopeptides[])))
colnames(tmp)<-c("AICDA_Absence","AICDA_Presence","APOBEC_Absence","APOBEC_Presence")

##Perform fisher test
tmp$Fisher_pval<-apply(tmp,1,function(x){
   test<-rbind(c(x["AICDA_Absence"],x["AICDA_Presence"]),c(x["APOBEC_Absence"],x["APOBEC_Presence"]))
   pval <- fisher.test(test)$p.value
    return(pval)#can also be return
})

write.table(tmp,file = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/AICDA_vs_APOBEC_ICNs_perTumor.tab",quote = F,sep = "\t")
###Supplementary table for comparisons...

res_tmp<-cbind(as.data.frame.matrix(xtabs(~type +Clonal_immuno_AICDA_binary , data=TCGA_clinical2_neopeptides[]))/rowSums(as.data.frame.matrix(xtabs(~type +Clonal_immuno_AICDA_binary , data=TCGA_clinical2_neopeptides[])))*100,
as.data.frame.matrix(xtabs(~type +Clonal_immuno_APOBEC_binary , data=TCGA_clinical2_neopeptides[]))/rowSums(as.data.frame.matrix(xtabs(~type +Clonal_immuno_APOBEC_binary , data=TCGA_clinical2_neopeptides[])))*100)
##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")
#Import data
##Import data
ci<-as.data.table(read.delim("Data/Figure6/Figure6A_1A.tab"))
dat2<-as.data.table(read.delim("Data/Figure6/Figure6A_1B.tab"))
ci$Role <- factor(ci$Role, levels=c('clonal','subclonal'))
dat2$Role <- factor(dat2$Role, levels=c('clonal','subclonal'))
##left inset plot from Figure 6A
dat2$variable <- factor(dat2$variable,levels=(c("Non-Immunogenic", "Immunogenic")))

cols <- c('#2D92BA','#B7DAEA')
names(cols) <- rev(levels(dat2$variable))

pval_label<- "p ~1e-27" #From calculations
  
p1 <- ggplot(dat2, aes(x=Role,y=value)) +
  geom_bar(stat='identity',aes(fill=variable)) +
  geom_signif(comparisons=list(c("clonal", "subclonal")), y_position =105, tip_length = 0, annotations=pval_label) + 
  scale_fill_manual(values=cols,name='Immunogenecity') +
  geom_errorbar(data=ci,aes(min=lower,max=value),color='white',width=0) +
  geom_errorbar(data=ci,aes(min=value,max=upper),color='black',width=0) +
  scale_y_continuous(expand=c(0,0),limits=c(0,115),breaks=seq(0,100,by=25),position='right') +
  labs(x=NULL,y='% of all neopeptides') +
  theme_std(base_size=14) +
  theme(legend.position='bottom') + #axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
  coord_flip()


##Main Figure from the panel 6A

#####Import data

ll_merged<-as.data.table(read.delim("Data/Figure6/Figure6A_2A.tab"))
total_number_clonal_immuno_APOBEC<-5154 #prev.calculated
total_number_clonal_immuno_AID<-2177  #Prev calculated

### Calculate P-value with proportion testing at mid point
###AICDA vs APOBEC
AID_summary <- data.table(x=median(ll_merged$p.ClonalAIDImmuno,na.rm = T)*total_number_clonal_immuno_AID,n=total_number_clonal_immuno_AID,class='AID') #x= the sum of the number of different samples having a neopeptide at each x hotspot; n= the total of neopeptides

APOBEC_summary <- data.table(x=median(ll_merged$p.ClonalAPOBECImmuno,na.rm = T)*total_number_clonal_immuno_APOBEC,n=total_number_clonal_immuno_APOBEC,class='APOBEC')
summary <- rbind(AID_summary, APOBEC_summary)
pval <- prop.test(summary$x, summary$n)$p.value
pval_lab <- paste0('P~',prettyNum(pval,digits=1)) ##


## prep data for plot
toadd <- data.table(hotspot='none',n.mutant=NA,n.neopeptides=NA,n.noImmunogenic=NA,n.subclonalImmunogenic=NA,n.ClonalAIDImmuno=NA,n.ClonalAPOBECImmuno=NA,n.ClonalOtherCImmuno=NA,x.order=0,p.noImmunogenic=0,p.subclonalImmunogenic=0,p.ClonalAIDImmuno=0,p.ClonalAPOBECImmuno=0,p.ClonalOtherCImmuno=0,p.neopeptides=0)
pdat2 <- rbind(toadd,ll_merged)
pdat2 <- melt(pdat2[,c('hotspot','x.order','p.noImmunogenic','p.subclonalImmunogenic',"p.ClonalAIDImmuno","p.ClonalAPOBECImmuno","p.ClonalOtherCImmuno"),with=F],id.var=c('x.order','hotspot'))


pdat2$Class <- ''
pdat2[variable=='p.noImmunogenic',Class:='All non-Immunogenic']
pdat2[variable=='p.subclonalImmunogenic',Class:='Subclonal Immunogenic']
pdat2[variable=='p.ClonalAIDImmuno',Class:='Clonal Immunogenic AICDA']
pdat2[variable=='p.ClonalAPOBECImmuno',Class:='Clonal Immunogenic APOBEC']
pdat2[variable=='p.ClonalOtherCImmuno',Class:='Clonal Immunogenic Other']



pdat2$Class <- factor(pdat2$Class,levels=c('All non-Immunogenic','Subclonal Immunogenic',"Clonal Immunogenic AICDA","Clonal Immunogenic APOBEC","Clonal Immunogenic Other"))

cols <- c('#979797',"black","darkred",'#1B89AD',"green")
names(cols) <- levels(pdat2$Class)


## plot
##FOld change at mid point
FC1<-round(median(ll_merged$p.ClonalAIDImmuno,na.rm = T)/median(ll_merged$p.ClonalAPOBECImmuno,na.rm = T),2)
#Fold change

p2_ags <- ggplot(pdat2[pdat2$Class%in%c("Clonal Immunogenic AICDA","Clonal Immunogenic APOBEC"),],aes(x=x.order)) +
  geom_line(aes(x=x.order,y=value,color=Class),size=1) +
  geom_text(label=paste0(pval_lab,"; FC=",FC1),data=pdat2[1,],aes(x=median(pdat2$x.order),y=median(ll_merged$p.ClonalAPOBECImmuno,na.rm = T)+((median(ll_merged$p.ClonalAIDImmuno,na.rm = T)-median(ll_merged$p.ClonalAPOBECImmuno,na.rm = T))/2))) +labs(x='Hotspot residue number  \n(more frequent<----------------------->less frequent)',y='% of all peptides (within each origin)') + 
  scale_y_continuous(expand=c(0,0),labels=scales::percent,limits=c(-.1,1)) +
  scale_color_manual(values=cols,name="NeoPeptide origin",) +
  theme_std(base_size=14)+ scale_x_continuous(labels = c("0","50k","100k","150k","200k","250k"))+
  theme(legend.position='bottom')+geom_label_repel(data=pdat2[c(2,30),],position = position_dodge(width=0.9),aes(x = x.order, y = value+.01,label=hotspot), label.padding = unit(0.15, "lines"),segment.size = 0.1,hjust = 2,point.padding = 0.2,box.padding= unit(0.5, "lines"),label.size = 0.1,min.segment.length = 0,inherit.aes = F, show.legend = F,ylim = 0.03)+geom_vline(xintercept =median(pdat2$x.order),color='black',size=0.5,linetype='dashed')





###Right inset plot from panel 6A

##Import data
ci<-as.data.table(read.delim("Data/Figure6/Figure6A_3A.tab"))
dat2<-as.data.table(read.delim("Data/Figure6/Figure6A_3B.tab"))


##left inset plot from Figure 6A

ci$Role <- factor(ci$Role, levels=c('APOBEC','AICDA'))
dat2$Role <- factor(dat2$Role, levels=c('APOBEC','AICDA'))
dat2$variable <- factor(dat2$variable,levels=(c("Absence", "Presence")))

cols <- c('#2D92BA','#B7DAEA')
names(cols) <- rev(levels(dat2$variable))



pval_label<- "p ~1e-65" #From calculations

p1_sam <- ggplot(dat2, aes(x=Role,y=value)) +
  geom_bar(stat='identity',aes(fill=variable)) +
  geom_signif(comparisons=list(c("AICDA", "APOBEC")), y_position =105, tip_length = 0, annotations=pval_label) + 
  scale_fill_manual(values=cols,name='Clonal immunogenic neopeptides') +
  geom_errorbar(data=ci,aes(min=lower,max=value),color='white',width=0) +
  geom_errorbar(data=ci,aes(min=value,max=upper),color='black',width=0) +
  scale_y_continuous(expand=c(0,0),limits=c(0,115),breaks=seq(0,100,by=25),position='right') +
  labs(x=NULL,y='% of all samples') +
  theme_std(base_size=14) +
  theme(legend.position='bottom') + #axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
  coord_flip()


###Plot Together

####Create plot to graph the total number of mutations on x
p1_1<-plot_grid(plotlist=list(p1,NULL,p1_sam), align='v', ncol=3, nrow=1,rel_widths = c(1,.2,1))

p2_ags_3= grid.grabExpr({
    print(plot_grid(plotlist=list(p1_1,p2_ags), align='v', ncol=1, nrow=2, rel_heights=c(2,6),rel_widths = c(.75,2)))
})

col_fun2 = colorRamp2(c(0, 2,10, 10+1e-6, 25,100), c("white", "lightskyblue", "#023FA5","yellow","goldenrod","red"))
lgd = Legend(col_fun = col_fun2, title = "# Distinct mutations",at =c(0,2,10,25,75,100),break_dist = c(2, 3,2,1,1), legend_height = unit(4, "cm") )



ha = HeatmapAnnotation(foo = ll_merged$n.mutant[c(1:5600)], simple_anno_size = unit(1, "cm"),col = list(foo = col_fun2),annotation_label = c(""),annotation_legend_param = list(
        foo = list(
                title = "# Distinct mutations",
                at = c(0,2,10,25,75,100),
                labels = c(0,2,10,25,75,100)
            )))


###Draw main
grid.newpage()
grid.draw(p2_ags_3)
pushViewport(viewport(width = 0.84, height = 1,just = c("bottom")))
draw(ha,x = unit(0.01, "npc"), y = unit(-.36, "npc"), just = c("left", "bottom"))
pushViewport(viewport(width = 0.84, height = 1,just = c("bottom")))
draw(lgd,x = unit(1.2, "npc"), y = unit(-.8, "npc"), just = c("right", "bottom"))

p_neoAgs2<-recordPlot()

##Save 10.5 X 18,TCGA_neoAgs__FigA2.pdf
p_neoAgs2


Fig 6B

##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")

##Add to the clinical the total number of clonal immunogenic AICDA/APOBEC provoked neopeptides for each sample..
#For AICDA
ag_melt2<-ag_melt[ag_melt$Tumor_Sample_Barcode%in%TCGA_clinical2_neopeptides$bcr_patient_barcode&ag_melt$SBS.Sig.max_reduced%in%"AICDA_canonical"&ag_melt$Clonality=="clonal"&ag_melt$Immunogenic_Prime=="Immunogenic",]
TCGA_clinical2_neopeptides$Clonal_immuno_AICDA<-ag_melt2$Ag_sum[match(TCGA_clinical2_neopeptides$bcr_patient_barcode,ag_melt2$Tumor_Sample_Barcode)]
TCGA_clinical2_neopeptides$Clonal_immuno_AICDA[is.na(TCGA_clinical2_neopeptides$Clonal_immuno_AICDA)]<-0
#FOR APOBEC
ag_melt2<-ag_melt[ag_melt$Tumor_Sample_Barcode%in%TCGA_clinical2_neopeptides$bcr_patient_barcode&ag_melt$SBS.Sig.max_reduced%in%c("SBS2")&ag_melt$Clonality=="clonal"&ag_melt$Immunogenic_Prime=="Immunogenic",]
ag_melt3<-ag_melt[ag_melt$Tumor_Sample_Barcode%in%TCGA_clinical2_neopeptides$bcr_patient_barcode&ag_melt$SBS.Sig.max_reduced%in%c("SBS13")&ag_melt$Clonality=="clonal"&ag_melt$Immunogenic_Prime=="Immunogenic",]
TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC<-ag_melt2$Ag_sum[match(TCGA_clinical2_neopeptides$bcr_patient_barcode,ag_melt2$Tumor_Sample_Barcode)]+ag_melt3$Ag_sum[match(TCGA_clinical2_neopeptides$bcr_patient_barcode,ag_melt3$Tumor_Sample_Barcode)]
TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC[is.na(TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC)]<-0

###Transform to binary if present or absent
TCGA_clinical2_neopeptides$Clonal_immuno_AICDA_binary<-ifelse(TCGA_clinical2_neopeptides$Clonal_immuno_AICDA>0,"Presence","Absence")#AICDA
TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC_binary<-ifelse(TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC>0,"Presence","Absence")#APOBEC
TCGA_clinical2_neopeptides$Clonal_immuno_AICDA_binary <- factor(TCGA_clinical2_neopeptides$Clonal_immuno_AICDA_binary,levels=c("Absence","Presence")) #To take Absence as reference in the model
TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC_binary <- factor(TCGA_clinical2_neopeptides$Clonal_immuno_APOBEC_binary,levels=c("Absence","Presence"))
#####
summary(coxph(Surv(OS.time, OS) ~ Clonal_immuno_AICDA_binary , data = TCGA_clinical2_neopeptides))
#Significant  better response if not having the neopeptides! ...


summary(coxph(Surv(OS.time, OS) ~ Clonal_immuno_APOBEC_binary , data = TCGA_clinical2_neopeptides))
#Not significant
TCGA_clinical2_neopeptides$Age_binary<-ifelse(TCGA_clinical2_neopeptides$age_at_initial_pathologic_diagnosis>=median(TCGA_clinical2_neopeptides$age_at_initial_pathologic_diagnosis,na.rm = T),1,0) #Cut_off median age
summary(coxph(Surv(OS.time, OS) ~ Clonal_immuno_AICDA_binary + Age_binary, data = TCGA_clinical2_neopeptides)) #Not significant but close...


####Add immune variables
#Download data from: https://cavei.github.io/example-datasets/panCancerAnnotation.RData 
load("panCancerAnnotation.RData")
panCancerAnnotation2<-NULL
for(i in 1:length(panCancerAnnotation)){
    panCancerAnnotation2<-rbind(panCancerAnnotation2,panCancerAnnotation[[i]][,])
  } #Transform list to compleate data.frame

# change the format of the input file as most of the variables are considered as factors due to the use of "," as decimal.
# save the dataframe as .txt
write.table(panCancerAnnotation2, file = "/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/panCancerAnnotation.txt", 
            dec = ",", 
            sep = "\t", 
            quote = F, 
            col.names = T, 
            row.names = F)
                                 
# read it considering the "," as decimals
panCancerAnnotation_corrected <- read.delim("/media/isaias.hernandez/seagate_ICM/AICDA_pancancer/panCancerAnnotation.txt", dec = ",")  



panCancerAnnotation3<-subset(panCancerAnnotation_corrected, panCancerAnnotation_corrected$TCGA.Participant.Barcode %in% TCGA_clinical2$bcr_patient_barcode) #subset to same samples in TCGA_clinical2

TCGA_clinical3<-subset(TCGA_clinical2, TCGA_clinical2$bcr_patient_barcode %in% panCancerAnnotation3$TCGA.Participant.Barcode) #Subset so they have the same info

panCancerAnnotation3<-panCancerAnnotation3[order(factor(panCancerAnnotation3$TCGA.Participant.Barcode, levels = TCGA_clinical3$bcr_patient_barcode)),]#put in same order
all(TCGA_clinical3$bcr_patient_barcode == panCancerAnnotation3$TCGA.Participant.Barcode)#must be true


TCGA_clinical3<- cbind(TCGA_clinical3, panCancerAnnotation3) #add panCancerAnnotation data 


  
#Merge
  ###Add other clinical data
TCGA_clinical2_neopeptides<-merge(TCGA_clinical2_neopeptides,TCGA_clinical3[TCGA_clinical3$bcr_patient_barcode%in%TCGA_clinical2_neopeptides$bcr_patient_barcode,c("bcr_patient_barcode","SNV.Neoantigens","Indel.Neoantigens","Immune.Subtype","T.Cells.CD8","Lymphocytes","C3_subtpe","C5_subtpe","T.Cells.Follicular.Helper","NK.Cells.Activated","T.Cells.CD4.Memory.Activated","Macrophages")],by.x="bcr_patient_barcode",all.x=T)



ggboxplot(data = TCGA_clinical2_neopeptides, y ="CYT_activity", x="Clonal_immuno_AICDA_binary",color = "Clonal_immuno_AICDA_binary")+theme_std()+stat_compare_means(method = "wilcox.test", aes(group = Clonal_immuno_AICDA_binary),label = "p.format",label.x = 1.5)

t<-melt(TCGA_clinical2_neopeptides[],id.vars=c('Clonal_immuno_AICDA_binary',"type"), measure.vars=colnames(TCGA_clinical2_neopeptides)[40:45])


ggboxplot(t,x = "variable", y="value", conf.int = FALSE, ,color="Clonal_immuno_AICDA_binary",xlab = "Variable", ylab = "V",label.rectangle=TRUE, ggtheme=theme_bw(base_size = 14),facet.by = "type") +theme_std()+stat_compare_means(method = "wilcox.test", label.x = 1.5, label="p.signif", aes(group=Clonal_immuno_AICDA_binary))+theme(plot.margin = unit(c(2,1,1,1), "lines"),axis.text.x = element_text(angle=90))+guides(color=guide_legend(title = "ICN AID"))

#c(70:71,81:84)


#The suppressive index consists of 11 well-characterized, highly expressed T-cell checkpoint molecules: ADORA2A (A2AR), CD274 (PD-L1), PDCD1 (PD1), CTLA4, HAVCR2 (TIM3), IDO1, IDO2, PDCD1LG2 (PD-L2), TIGIT, VISTA (C10orf54), and VTCN1 (B7-H4; ref. 14). T-cell dysfunction index consists of 567 genes identified by Schietinger and colleagues (geneset F; ref. 37).

###Add expression of CXCL13,CCR5, STK24, PRKCQ and DOCK2
Exp_tmp<-RNA_counts_pan2[rownames(RNA_counts_pan2)%in%c("CXCL13","CCR5", "STK24", "PRKCQ","DOCK2"),colnames(RNA_counts_pan2)%in%TCGA_clinical2_neopeptides$bcr_patient_barcode]
Exp_tmp<-as.data.frame(t(Exp_tmp))

Exp_tmp$bcr_patient_barcode<-rownames(Exp_tmp)
##Merge
TCGA_clinical2_neopeptides<-merge(TCGA_clinical2_neopeptides,Exp_tmp,by="bcr_patient_barcode")


t<-melt(TCGA_clinical2_neopeptides[],id.vars=c('Clonal_immuno_AICDA_binary',"type"), measure.vars=colnames(TCGA_clinical2_neopeptides)[c(40:45,85:89)])


ggboxplot(t,x = "variable", y="value", conf.int = FALSE, ,color="Clonal_immuno_AICDA_binary",xlab = "Variable", ylab = "V",label.rectangle=TRUE, ggtheme=theme_bw(base_size = 14),facet.by = "type") +theme_std()+stat_compare_means(method = "wilcox.test", label.x = 1.5, label="p.signif", aes(group=Clonal_immuno_AICDA_binary))+theme(plot.margin = unit(c(2,1,1,1), "lines"),axis.text.x = element_text(angle=90))+guides(color=guide_legend(title = "ICN AID"))


######Get a table of results per tumor type and using all tumors for each variable to then plot as a heatmap ..
t<-melt(TCGA_clinical2_neopeptides[],id.vars=c('Clonal_immuno_AICDA_binary',"type"), measure.vars=colnames(TCGA_clinical2_neopeptides)[c(40:45,85:89,70:71,81:84)])


####Graph with complexHeatmap
##Get comparisons

####Graph with complexHeatmap
##Get comparisons
data_heatTCGA<-as.data.frame(compare_means(value~Clonal_immuno_AICDA_binary,group.by = c("variable","type"),data = t))#All tumor types
##Add global value
data_heatTCGA2<-as.data.frame(compare_means(value~Clonal_immuno_AICDA_binary,group.by = "variable",data = t))
data_heatTCGA2$type<-"All Tumors"
data_heatTCGA<-merge(data_heatTCGA,data_heatTCGA2,all=T)

###Get in which group the mean was higher
library(dplyr)
t2<-t %>%
   group_by(Clonal_immuno_AICDA_binary, type,variable) %>% 
  summarise(value = mean(value, na.rm=T))
t2<-as.data.frame(t2)
##Now all Tumors..
t2_2<-t %>%
   group_by(Clonal_immuno_AICDA_binary,variable) %>% 
   summarise(value = mean(value, na.rm=T))
t2_2<-as.data.frame(t2_2)
t2_2$type<-"All Tumors"
#Merge
t2<-merge(t2,t2_2,all=T)
#Reduce data
t2<-cbind(t2[t2$Clonal_immuno_AICDA_binary=="Absence",],t2[t2$Clonal_immuno_AICDA_binary=="Presence","value"])
colnames(t2)[c(4,5)]<-c("value_Absence","value_Presence")
##Calculate log2FC
t2$Log2FC<-log2(t2$value_Presence/t2$value_Absence)
t2$Log2FC[is.na(t2$Log2FC)]<-0
t2$id<-paste0(t2$type,t2$variable)
#FC
t2$FC<-(t2$value_Presence/t2$value_Absence)
t2$FC[is.na(t2$FC)]<-0
t2$id<-paste0(t2$type,t2$variable)


data_heatTCGA$id<-paste0(data_heatTCGA$type,data_heatTCGA$variable)
data_heatTCGA<-merge(data_heatTCGA,t2[,c("id","FC","Log2FC")],by="id")
summary(data_heatTCGA$FC)

data_heatTCGA$Group_AID<-ifelse(data_heatTCGA$FC==1,0,ifelse(data_heatTCGA$FC>1,1,-1))

data_heatTCGA$Log2FC<-log2(data_heatTCGA$FC)
data_heatTCGA$Log2FC[data_heatTCGA$Log2FC==-Inf]<--3.5
##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")
##Import data
data_heatTCGA<-as.data.table(read.delim("Data/Figure6/Figure6B.tab"))

###COmplex Heatmap
data_heatTCGA3<-as.data.frame(dcast(data_heatTCGA, type ~variable , value.var = "Log2FC")) #For values

data_heatTCGA_sig<-as.data.frame(dcast(data_heatTCGA, type ~variable , value.var = "p.signif") )#For annotating significance..
rownames(data_heatTCGA3)<-data_heatTCGA3$type
data_heatTCGA3<-data_heatTCGA3[,-1]

#Re order rows
data_heatTCGA3<-data_heatTCGA3[c(1,11,12,4,7,13,6,10,3,15,9,2,8,5,14),]

##Add number of samples...
##Will add manually:

#Can be obtained with:
rownames(data_heatTCGA3)<-c("All Tumors(n = 2143)","LUAD(n = 138)","LUSC(n = 143)","CESC(n = 142)","KIRC(n = 216)","PRAD(n = 147)","HNSC(n = 241)","LIHC(n = 145)","BRCA(n = 358)","UCEC(n = 78)","LGG(n = 120)","BLCA(n = 119)","KIRP(n = 117)","GBM(n = 106)","THCA(n = 73)")
#rownames(data_heatTCGA3)<-paste0(rownames(data_heatTCGA3),"(n = ",c(sum(table(TCGA_clinical2_neopeptides$type)),table(TCGA_clinical2_neopeptides$type)),")")

#Re order and change colnames...
data_heatTCGA3<-data_heatTCGA3[,c(12:17,8,7,11,9:10,1:6)]
colnames(data_heatTCGA3)<-c("CD8 T cells","Lymphocytes","Follicular Helper T cells","NK activated","CD4 memory activated T cells","Macrophages","CXCL13","CCR5","STK24","DOCK2","PRKCQ","PD1","PDL1","PDL2","CTLA-4","TIM3","LAG3")

##Data for significant annotation
rownames(data_heatTCGA_sig)<-data_heatTCGA_sig$type
data_heatTCGA_sig<-data_heatTCGA_sig[,-1]
data_heatTCGA_sig[data_heatTCGA_sig=="ns"]<-"";data_heatTCGA_sig[is.na(data_heatTCGA_sig)]<-""
##Re-order
data_heatTCGA_sig<-data_heatTCGA_sig[c(1,11,12,4,7,13,6,10,3,15,9,2,8,5,14),c("T.Cells.CD8.x","Lymphocytes.x","T.Cells.Follicular.Helper","NK.Cells.Activated","T.Cells.CD4.Memory.Activated","Macrophages","CXCL13","CCR5", "STK24","DOCK2","PRKCQ","PD1_exp", "PDL1_exp","PDL2_exp","CTLA4_exp","TIM3_exp","LAG3_exp")]


##For colors
col_fun3 = colorRamp2(c(-3.5,-.4,0,.4,3.5), c("firebrick",scales::alpha("orangered2",.8), "white","lightskyblue","royalblue4"))

TCGA_heat_ICN<-Heatmap(as.matrix(data_heatTCGA3), name = "Log2(FC)", col = col_fun3, column_title = "ICN AID Presence vs Absence across TCGA tumor types",column_names_side = "top", column_names_rot = 45,row_names_side = "left",show_column_dend = FALSE,show_row_dend =F, cluster_columns = FALSE,cluster_rows = F,column_split =  c(rep(c("A"), 6),rep(c("B"), 2),rep(c("C"), 3),rep(c("D"), 6)),column_title_gp = gpar(fontsize = 18, fontface = "bold",col="black"),column_names_gp = gpar(fontsize=10),border = TRUE,show_column_names = T, heatmap_legend_param = list(title= "Log2(FC)", at = c(-3.5, -1.75,-.5,0, .5,1.75,3.5), labels = c(-3.5, -1.75,-.5,0, .5,1.75,3.5)),cell_fun = function(j, i, x, y, width, height, fill) { grid.text( data_heatTCGA_sig[i, j], x, y, gp = gpar(fontsize = 10))
},column_gap = unit(2, "mm"),top_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = c("plum3","red","chocolate1","darkgreen")),
                                                                                 labels = c("Immune cell population", "T cell reactivity\nto clonal neoepitopes", "Others","Immune checkpoint"), 
                                                                                 labels_gp = gpar(col = "black", fontsize = 10,fontface=2))))



draw(TCGA_heat_ICN,annotation_legend_list=Legend(title = "p-value", at = c(0, 1, 2, 3), type = "points",pch = c("****","***","**","*",""),labels = c("<= 0.0001", "<= 0.001", "<= 0.01", "<= 0.05","ns"),grid_width = unit(.8, "cm")))


Fig 6C

##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")
##Riaz et al cohort, melanoma patients treated with ICI
##Download data from https://github.com/riazn/bms038_analysis/tree/master/data

###Import mutation data
Riaz_MAF <- read_excel("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz2017_MAF_PreTx.xlsx", skip = 3)
Riaz_MAF<-as.data.frame(Riaz_MAF)
dim(Riaz_MAF)
load("/media/user/seagate_ICM/AICDA_analysis/ICGC/APOBEC_enrich_function_WES_WGS.RData")
load("/media/user/seagate_ICM/AICDA_analysis/ICGC/AICDA_enrich_function_WES_WGS.RData")
##According to the methods of the original article the maf file is annotated according to the hg19
colnames(Riaz_MAF)[1:6]<-c("Tumor_Sample_Barcode","Hugo_Symbol","Chromosome","Start_Position","End_Position","Variant_Classification")
Riaz_MAF$Reference_Allele<-substr(x = gsub(pattern = ">.*",replacement = "",x = Riaz_MAF$HGVS_c),start = nchar(gsub(pattern = ">.*",replacement = "",x = Riaz_MAF$HGVS_c)),stop = nchar(gsub(pattern = ">.*",replacement = "",x = Riaz_MAF$HGVS_c)))
  Riaz_MAF$Tumor_Seq_Allele2<-gsub(pattern = ".*>",replacement = "",x = Riaz_MAF$HGVS_c)
  Riaz_MAF$Variant_type<-"SNP"
  ##Add this to allow calculations
Riaz_MAF$Variant_Classification<-ifelse(Riaz_MAF$Variant_Classification=="missense_variant"|Riaz_MAF$Variant_Classification=="missense_variant&splice_region_variant","Missense_Mutation",ifelse(Riaz_MAF$Variant_Classification%in%c("splice_acceptor_variant&intron_variant","splice_donor_variant&intron_variant","start_lost&splice_region_variant","stop_gained&splice_region_variant","splice_acceptor_variant&splice_donor_variant&intron_variant"),"Splice_Site",no = "Nonsense_Mutation")) #Assign to a variant,   
  
Riaz_MAF2<-read.maf(Riaz_MAF) #Transform to maf by maftools
  library(BSgenome.Hsapiens.UCSC.hg19)
Riaz_MAF_AICDA <- AICDA_enrichment(Riaz_MAF2, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE, background_n = 60,seq_type="WGS") #
nrow(Riaz_MAF_AICDA$maf_AICDA)/nrow(Riaz_MAF)

##Assign mutations to AICDA provoked or not...
Riaz_MAF$Mut_identifier<-paste0(Riaz_MAF$Tumor_Sample_Barcode,"-",Riaz_MAF$Start_Position,"-",Riaz_MAF$HGVS_c,"-",Riaz_MAF$Tcov)

Riaz_MAF_AICDA$maf_AICDA$Mut_identifier<-paste0(Riaz_MAF_AICDA$maf_AICDA$Tumor_Sample_Barcode,"-",Riaz_MAF_AICDA$maf_AICDA$Start_Position,"-",Riaz_MAF_AICDA$maf_AICDA$HGVS_c,"-",Riaz_MAF_AICDA$maf_AICDA$Tcov)
####
Riaz_MAF$AICDA_mutation<-ifelse(Riaz_MAF$Mut_identifier %in% Riaz_MAF_AICDA$maf_AICDA$Mut_identifier, "YES","NO")###Add a column next to each mutation to indicate if it is AICDA induced or not


table( Riaz_MAF$AICDA_mutation)

###Get the %
table( Riaz_MAF$AICDA_mutation)/nrow(Riaz_MAF)*100
###This is higher than previously seen in other tumor ~ 5 % and even slightly higher than hematologicals ~8%

####Import clinical data and save the fraction of AICDA/APOBEC mutations
Riaz_Clinical <- read_excel("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz2017_clinical.xlsx",skip=2)
Riaz_Clinical<-as.data.frame(Riaz_Clinical)
colnames(Riaz_Clinical)[1]<-"Tumor_Sample_Barcode"
Riaz_Clinical <- merge(x = Riaz_Clinical, y = Riaz_MAF_AICDA$AICDA_scores[,c("Tumor_Sample_Barcode","fraction_AICDA_mutations")], by = "Tumor_Sample_Barcode") #This subset to the common 68 samples that are present in the maf file

##Now APOBEC
Riaz_MAF2_APOBEC <- APOBEC_enrichment(Riaz_MAF2, ref_genome = "BSgenome.Hsapiens.UCSC.hg19",prefix = 'chr', add = TRUE,seq_type="WGS")
Riaz_Clinical <- merge(x = Riaz_Clinical, y = Riaz_MAF2_APOBEC$APOBEC_scores[,c("Tumor_Sample_Barcode","fraction_APOBEC_mutations")], by = "Tumor_Sample_Barcode")

##For analysis separate Ipi-P from Ipi-N since different response and OS was shown in the original article..
Riaz_Clinical$Response_group<-ifelse(Riaz_Clinical$Response%in%c("CR","PR"),"CR/PR",Riaz_Clinical$Response)

t<-melt(Riaz_Clinical[Riaz_Clinical$Response_group%nin%"NE",],id.vars=c('Response_group',"Cohort"), measure.vars=c('fraction_AICDA_mutations','fraction_APOBEC_mutations')) #Do not consider NE samples (Not evaluable)
Riaz_Clinical2<-Riaz_Clinical[Riaz_Clinical$Response_group%nin%"NE",]
#Add line global median, add n samples to labels...

colors_riaz<-c("limegreen","orange2","blueviolet")
  
  p_tmp1<- ggboxplot(data = t[t$Cohort=="NIV3-NAIVE",], y ="value", x="variable",fill  = "Response_group",xlab = "Signature",ylab = "Fraction of SNP mutations",title = "Riaz study (n = 30; Ipi-Naive)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-NAIVE"&t$variable=="fraction_APOBEC_mutations"]),color="APOBEC"), linetype="dashed",show.legend = TRUE)+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-NAIVE"&t$variable=="fraction_AICDA_mutations"]),color="AICDA"), linetype="dashed",show.legend = TRUE)+guides(color=guide_legend(title = "Signature"),fill=guide_legend(title = "Clinical group"))+scale_x_discrete(labels=c(paste0("AICDA (median = ",round(median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]),3),")"),paste0("APOBEC (median = ",round(median(Riaz_Clinical2$fraction_APOBEC_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]),3),")")))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)+scale_color_manual(values = c(AICDA="#F8766D",APOBEC="#00BFC4"),labels=c("AICDA","APOBEC"))
  

p_tmp2<- ggboxplot(data = t[t$Cohort=="NIV3-PROG",], y ="value", x="variable",fill  = "Response_group",xlab = "Signature",ylab = "Fraction of SNP mutations",title = "Riaz study (n = 35; Ipi-PROG)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-PROG"&t$variable=="fraction_APOBEC_mutations"]),color="APOBEC"), linetype="dashed",show.legend = TRUE)+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-PROG"&t$variable=="fraction_AICDA_mutations"]),color="AICDA"), linetype="dashed",show.legend = TRUE)+guides(color=guide_legend(title = "Signature"),fill=guide_legend(title = "Clinical group"))+scale_x_discrete(labels=c(paste0("AICDA (median = ",round(median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]),3),")"),paste0("APOBEC (median = ",round(median(Riaz_Clinical2$fraction_APOBEC_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]),3),")")))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)+scale_color_manual(values = c(AICDA="#F8766D",APOBEC="#00BFC4"),labels=c("AICDA","APOBEC"))
  

Riaz_fractionMuts<-ggarrange(p_tmp1+ylim(c(0,.2)),p_tmp2+theme(axis.title.y = element_blank(),axis.ticks.y = element_blank(),axis.line.y = element_blank(),axis.text.y = element_blank())+ylim(c(0,.2)),nrow = 1,common.legend = T,legend = "right")


ggsave(Riaz_fractionMuts,filename = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Boxplot_AICDA_APOBEC_fraction_Riaz_ICI.pdf",width = 14,height = 12)

##This figure was not included in the manuscript but show higher number of AICDA muts than those of APOBEC

###Quick survival analysis

Riaz_Clinical2$OS_status<-ifelse(Riaz_Clinical2[,4]=="TRUE",1,0)
Riaz_Clinical2$OS<-Riaz_Clinical2[,5]/52.1429 #Transform to years

#OS in Ipi-N patients by mutation load (high mutation load defined as >100 mutations); article said that high mutational load was correlated with better survival
Riaz_Clinical2$`Mutation Load`<-as.numeric(Riaz_Clinical2$`Mutation Load`)
Riaz_Clinical2$Mut_load_binary<-ifelse(Riaz_Clinical2$`Mutation Load`>100,"High","Low")
summary(coxph(Surv(OS, OS_status) ~  Mut_load_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",]))
##Low mutational load associated with worse survival 0.02
summary(coxph(Surv(OS, OS_status) ~  Mut_load_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",]))
#Not significant

Riaz_Clinical2$AICDA_binary[Riaz_Clinical2$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"] > median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]), "High","Low") #Median depending on the group
Riaz_Clinical2$AICDA_binary[Riaz_Clinical2$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"] > median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]), "High","Low")

Riaz_Clinical2$APOBEC_binary[Riaz_Clinical2$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical2$fraction_APOBEC_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"] > median(Riaz_Clinical2$fraction_APOBEC_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]), "High","Low") #Median depending on the group
Riaz_Clinical2$APOBEC_binary[Riaz_Clinical2$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical2$fraction_APOBEC_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"] > median(Riaz_Clinical2$fraction_APOBEC_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]), "High","Low")



summary(coxph(Surv(OS, OS_status) ~  AICDA_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",]))
##Associated with response in naive, low number = worse response with p=0.033
summary(coxph(Surv(OS, OS_status) ~  AICDA_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",]))
#Not significant
summary(coxph(Surv(OS, OS_status) ~  APOBEC_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",]))
summary(coxph(Surv(OS, OS_status) ~  APOBEC_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",]))
#Not significant




####Mutational signatures...
###Compute mutational signatures for the mutations not related to AICDA
Riaz_MAF_noAICDA<-Riaz_MAF[Riaz_MAF$AICDA_mutation=="NO",]

load("/home/user/Palimpsest-master/RUNNING_PALIMPSEST_EXAMPLE/LiC1162/vcf.RData")
Riaz_MAF_noAICDA<-Riaz_MAF_noAICDA[,c(1,14,3,4,12,13,9,10,2,15)]
colnames(Riaz_MAF_noAICDA) <- c(colnames(vcf[c(1:7,8)]),"Gene_Name","Mut_identifier")

Riaz_MAF_noAICDA$Type <- ifelse(Riaz_MAF_noAICDA$Type == "SNP", yes = "SNV", ifelse(Riaz_MAF_noAICDA$Type == "DEL", yes = "DEL","INS"))

Riaz_MAF_noAICDA<-as.data.frame(Riaz_MAF_noAICDA)
Riaz_MAF_noAICDA$Sample<-as.character(Riaz_MAF_noAICDA$Sample)
#Preparing input data for mutational signature analysis
##Annotate mutations
##Eliminate NA in Type
Riaz_MAF_noAICDA<-Riaz_MAF_noAICDA[!is.na(Riaz_MAF_noAICDA$Type),]
##Take out wrongly format variants in which REF==ALT
Riaz_MAF_noAICDA<-Riaz_MAF_noAICDA[Riaz_MAF_noAICDA$REF!=Riaz_MAF_noAICDA$ALT,]

Riaz_MAF_noAICDA<-Riaz_MAF_noAICDA[Riaz_MAF_noAICDA$Type%in%"SNV",]#Subset to SNPS to avoid error

vcf_Riaz_MAF_noAICDA<-annotate_VCF(vcf=Riaz_MAF_noAICDA,ref_genome =BSgenome.Hsapiens.UCSC.hg19, ref_fasta=NULL , add_ID_cats=F,add_DBS_cats = F)
dim(vcf_Riaz_MAF_noAICDA)
exclude_sigs<-c("SBS27","SBS39","SBS43","SBS45" , "SBS46" , "SBS47" , "SBS48"  ,"SBS49" , "SBS50" , "SBS51" , "SBS52" , "SBS53" , "SBS54" , "SBS55","SBS56" , "SBS57" , "SBS58" , "SBS59" , "SBS60")#Signatures not to include to see if it helps the fitting,  18 are posible sequencing artefacts; SBS39 posible noise causing with AICDA signature

load("/r/Mutational_Signature_colors.RData")

SBS_cosmic_bytumor<-SBS_cosmic[setdiff(rownames(SBS_cosmic),exclude_sigs),]##l
SBS_input<-palimpsest_input(vcf = vcf_Riaz_MAF_noAICDA, Type = "SBS") #Get tnm proportions and numbers by sample
    
    resdir.2<-paste0("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Mutational_signatures/");if(!file.exists(resdir.2)) dir.create(resdir.2) #Create directory if it does not exist
    

SBS_signatures <- deconvolution_fit(input_vcf = vcf_Riaz_MAF_noAICDA,input_matrices = SBS_input,input_signatures = SBS_cosmic_bytumor,signature_colours = sig_cols,resdir = resdir.2,doplot = F,save_signatures_exp = F) #Calcules weight for each signature

select <- which(colMeans(SBS_signatures$sig_nums) > 1) #Select signatures with real contribution
#Re-fit using only selected signatures
#selected mutational known signatures in tumor[i] samples   
X<-as.character(c(names(select),"SBS13"))
signatures3 <-SBS_cosmic_bytumor[X,]

SBS_signatures <- deconvolution_fit(input_vcf = vcf_Riaz_MAF_noAICDA,input_matrices = SBS_input,input_signatures = signatures3,signature_colours = sig_cols,resdir = resdir.2,doplot = F,save_signatures_exp = T) #Calcules weight for each signature, re-fit to improve visualization

####Assign mutations to each signature...

vcf_Riaz_MAF_noAICDA<- signature_origins(input =vcf_Riaz_MAF_noAICDA,Type = "SBS" ,input_signatures =SBS_cosmic_bytumor[colnames(SBS_signatures$sig_nums),] ,signature_contribution = SBS_signatures)

##Add an AICDA column with 0
vcf_Riaz_MAF_noAICDA$AICDA_canonical.prob<-0
head(vcf_Riaz_MAF_noAICDA)
##Merge both data sets
vcf_Riaz_MAF_AICDA<-Riaz_MAF[Riaz_MAF$AICDA_mutation=="YES",]

load("/home/user/Palimpsest-master/RUNNING_PALIMPSEST_EXAMPLE/LiC1162/vcf.RData")
vcf_Riaz_MAF_AICDA<-vcf_Riaz_MAF_AICDA[,c(1,14,3,4,12,13,9,10,2,15)]
colnames(vcf_Riaz_MAF_AICDA) <- c(colnames(vcf[c(1:7,8)]),"Gene_Name","Mut_identifier")

vcf_Riaz_MAF_noAICDA$AICDA_mutation<-"NO"
vcf_Riaz_MAF_AICDA$AICDA_mutation<-"YES"

vcf_sigOrigins_allmuts<- merge(vcf_Riaz_MAF_noAICDA,vcf_Riaz_MAF_AICDA,all=TRUE)
vcf_sigOrigins_allmuts$SBS.Sig.max[vcf_sigOrigins_allmuts$AICDA_mutation=="YES"]<-"AICDA_canonical"
# Add signature origin to original maf

Riaz_MAF$SBS.Sig.max<-vcf_sigOrigins_allmuts$SBS.Sig.max[match(Riaz_MAF$Mut_identifier,vcf_sigOrigins_allmuts$Mut_identifier)]



##Get the number of mutations per sample per signature
Riaz_MAF$mut_count<-1
Riaz_MAF$SBS.Sig.max_reduced<-ifelse(Riaz_MAF$SBS.Sig.max%in%c("SBS7a","SBS7b"),"UV",Riaz_MAF$SBS.Sig.max)
Riaz_MAF$SBS.Sig.max_reduced[is.na(Riaz_MAF$SBS.Sig.max_reduced)]<-"Other"  
  
riaz_melt<-Riaz_MAF[] %>% group_by(Tumor_Sample_Barcode,SBS.Sig.max_reduced) %>% dplyr::summarise(mut_sum = sum(mut_count))
riaz_melt<-as.data.frame(riaz_melt)
head(riaz_melt)
dim(riaz_melt)
riaz_melt2<-Riaz_MAF[] %>% group_by(Tumor_Sample_Barcode) %>% dplyr::summarise(Mut_load = sum(mut_count))
riaz_melt2<-as.data.frame(riaz_melt2)
head(riaz_melt2)

riaz_melt$Fraction_mutations<-riaz_melt$mut_sum/riaz_melt2$Mut_load[match(riaz_melt$Tumor_Sample_Barcode,riaz_melt2$Tumor_Sample_Barcode)]

###Add to the clinical data the fr
riaz_melt2<-riaz_melt[riaz_melt$SBS.Sig.max_reduced=="UV",]
Riaz_Clinical2$fraction_UV_mutations<-riaz_melt2$Fraction_mutations[match(Riaz_Clinical2$Tumor_Sample_Barcode,riaz_melt2$Tumor_Sample_Barcode)]

Riaz_Clinical2$fraction_UV_mutations[is.na(Riaz_Clinical2$fraction_UV_mutations)]<-0 #fill with O if NA
###New boxplot...

t<-melt(Riaz_Clinical2,id.vars=c('Response_group',"Cohort"), measure.vars=c('fraction_AICDA_mutations','fraction_UV_mutations')) #Do not consider NE samples (Not evaluable)

#Add line global median, add n samples to labels...

colors_riaz<-c("limegreen","orange2","blueviolet")
  
  p_tmp1<- ggboxplot(data = t[t$Cohort=="NIV3-NAIVE",], y ="value", x="variable",fill  = "Response_group",xlab = "Signature",ylab = "Fraction of SNP mutations",title = "Riaz study (n = 30; Ipi-Naive)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-NAIVE"&t$variable=="fraction_UV_mutations"]),color="UV"), linetype="dashed",show.legend = TRUE)+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-NAIVE"&t$variable=="fraction_AICDA_mutations"]),color="AICDA"), linetype="dashed",show.legend = TRUE)+guides(color=guide_legend(title = "Signature"),fill=guide_legend(title = "Clinical group"))+scale_x_discrete(labels=c(paste0("AICDA (median = ",round(median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]),3),")"),paste0("UV (median = ",round(median(Riaz_Clinical2$fraction_UV_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]),3),")")))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)+scale_color_manual(values = c(AICDA="#F8766D",UV="#00BFC4"),labels=c("AICDA","UV"))
  

p_tmp2<- ggboxplot(data = t[t$Cohort=="NIV3-PROG",], y ="value", x="variable",fill  = "Response_group",xlab = "Signature",ylab = "Fraction of SNP mutations",title = "Riaz study (n = 35; Ipi-PROG)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-PROG"&t$variable=="fraction_UV_mutations"]),color="UV"), linetype="dashed",show.legend = TRUE)+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-PROG"&t$variable=="fraction_AICDA_mutations"]),color="AICDA"), linetype="dashed",show.legend = TRUE)+guides(color=guide_legend(title = "Signature"),fill=guide_legend(title = "Clinical group"))+scale_x_discrete(labels=c(paste0("AICDA (median = ",round(median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]),3),")"),paste0("UV (median = ",round(median(Riaz_Clinical2$fraction_UV_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]),3),")")))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)+scale_color_manual(values = c(AICDA="#F8766D",UV="#00BFC4"),labels=c("AICDA","UV"))
  

Riaz_fractionMuts_UV<-ggarrange(p_tmp2+ylim(c(0,.5)),p_tmp1+theme(axis.title.y = element_blank(),axis.ticks.y = element_blank(),axis.line.y = element_blank(),axis.text.y = element_blank())+ylim(c(0,.5)),nrow = 1,common.legend = T,legend = "right")


#Add line global median, add n samples to labels...

##Check the ploting for 6C left side

##############
#########Now the KM plots on the right side
Riaz_Clinical2$UV_binary <- ifelse(Riaz_Clinical2$fraction_UV_mutations > median(Riaz_Clinical2$fraction_UV_mutations), "High","Low")


summary(coxph(Surv(OS, OS_status) ~  AICDA_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",]))
##Associated with response in naive, low number = worse response with p=0.033
summary(coxph(Surv(OS, OS_status) ~  AICDA_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",]))
#Not significant
summary(coxph(Surv(OS, OS_status) ~  UV_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",]))
summary(coxph(Surv(OS, OS_status) ~  UV_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",]))
#Not significant


###Create KM curves for figures....
##MUt load and AICDA in main for Naive.... UV in sup for naive and all prog...

fit <- survfit(Surv(OS, OS_status) ~ AICDA_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",])

psurv1<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - AICDA mutational load ",legend.title = "AICDA mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)



fit <- survfit(Surv(OS, OS_status) ~ UV_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",])

psurv2<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - UV mutational load ",legend.title = "UV mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)

# Arrange with the boxplots...
psurv3 <-arrange_ggsurvplots(list(psurv1,psurv2), print = FALSE, ncol = 2, nrow = 1)

###



####
##
####Supplementary KM curves...

#UV in naive
fit <- survfit(Surv(OS, OS_status) ~ Mut_load_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",])

psurv1<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - Mutation load ",legend.title = "Mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)


#Other in progressive

#AICDA prog

fit <- survfit(Surv(OS, OS_status) ~ AICDA_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",])

psurv2<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",],
  size = 1,                 # change line size
  title = "Ipi-PROG OS (Riaz study) - Mutation load ",legend.title = "AICDA mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)

#UV prog

fit <- survfit(Surv(OS, OS_status) ~ UV_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",])

psurv3<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",],
  size = 1,                 # change line size
  title = "Ipi-PROG OS (Riaz study) - UV mutations ",legend.title = "UV mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)


#Other in progressive
#Mutation load
fit <- survfit(Surv(OS, OS_status) ~ Mut_load_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",])

psurv4<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-PROG",],
  size = 1,                 # change line size
  title = "Ipi-PROG OS (Riaz study) - Mutation load ",legend.title = "Mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)


# Arrange multiple ggsurvplots and print the output
psurv_tmp <-arrange_ggsurvplots(list(psurv1,psurv2,psurv3,psurv4), print = FALSE, ncol = 2, nrow = 2)

## Not run: 
# Arrange and save into pdf file

ggsave("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/AICDA_mutLoad_UV_KM_Riaz_SUpplementary.pdf", psurv_tmp, width = 14,height = 12)

##That is Supplementary Figure 23A

#############
############
###########
##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")

###Figure 6C
#Import Data

Riaz_Clinical2<-as.data.frame(read.delim("Data/Figure6/Figure6C_1.tab",check.names = T))

t<-melt(Riaz_Clinical2,id.vars=c('Response_group',"Cohort"), measure.vars=c('fraction_AICDA_mutations','fraction_UV_mutations')) #Do not consider NE samples (Not evaluable)

#Add line global median, add n samples to labels...

colors_riaz<-c("limegreen","orange2","blueviolet")
  
  p_tmp1<- ggboxplot(data = t[t$Cohort=="NIV3-NAIVE",], y ="value", x="variable",fill  = "Response_group",xlab = "Signature",ylab = "Fraction of SNP mutations",title = "Riaz study (n = 30; Ipi-Naive)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-NAIVE"&t$variable=="fraction_UV_mutations"]),color="UV"), linetype="dashed",show.legend = TRUE)+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-NAIVE"&t$variable=="fraction_AICDA_mutations"]),color="AICDA"), linetype="dashed",show.legend = TRUE)+guides(color=guide_legend(title = "Signature"),fill=guide_legend(title = "Clinical group"))+scale_x_discrete(labels=c(paste0("AICDA (median = ",round(median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]),3),")"),paste0("UV (median = ",round(median(Riaz_Clinical2$fraction_UV_mutations[Riaz_Clinical2$Cohort=="NIV3-NAIVE"]),3),")")))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)+scale_color_manual(values = c(AICDA="#F8766D",UV="#00BFC4"),labels=c("AICDA","UV"))
  

p_tmp2<- ggboxplot(data = t[t$Cohort=="NIV3-PROG",], y ="value", x="variable",fill  = "Response_group",xlab = "Signature",ylab = "Fraction of SNP mutations",title = "Riaz study (n = 35; Ipi-PROG)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-PROG"&t$variable=="fraction_UV_mutations"]),color="UV"), linetype="dashed",show.legend = TRUE)+ geom_hline(aes(yintercept=median(t$value[t$Cohort=="NIV3-PROG"&t$variable=="fraction_AICDA_mutations"]),color="AICDA"), linetype="dashed",show.legend = TRUE)+guides(color=guide_legend(title = "Signature"),fill=guide_legend(title = "Clinical group"))+scale_x_discrete(labels=c(paste0("AICDA (median = ",round(median(Riaz_Clinical2$fraction_AICDA_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]),3),")"),paste0("UV (median = ",round(median(Riaz_Clinical2$fraction_UV_mutations[Riaz_Clinical2$Cohort=="NIV3-PROG"]),3),")")))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)+scale_color_manual(values = c(AICDA="#F8766D",UV="#00BFC4"),labels=c("AICDA","UV"))
  

Riaz_fractionMuts_UV<-ggarrange(p_tmp2+ylim(c(0,.5)),p_tmp1+theme(axis.title.y = element_blank(),axis.ticks.y = element_blank(),axis.line.y = element_blank(),axis.text.y = element_blank())+ylim(c(0,.5)),nrow = 1,common.legend = T,legend = "right")




##############
#########Now the KM plots on the right side

fit <- survfit(Surv(OS, OS_status) ~ AICDA_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",])

psurv1<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - AICDA mutational load ",legend.title = "AICDA mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)



fit <- survfit(Surv(OS, OS_status) ~ UV_binary, data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",])

psurv2<-ggsurvplot(
  fit,
  data = Riaz_Clinical2[Riaz_Clinical2$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - UV mutational load ",legend.title = "UV mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
)

# Arrange with the boxplots...
psurv3 <-arrange_ggsurvplots(list(psurv1,psurv2), print = FALSE, ncol = 2, nrow = 1)



xbp_grob2 <-ggarrange(psurv1$plot,NULL,psurv2$plot,align = "hv",nrow = 3,heights = c(2,.5,2))

Riaz_fractionMuts_UV2<-ggarrange(Riaz_fractionMuts_UV,NULL,xbp_grob2,ncol=3,widths = c(1,.2,1))

Riaz_fractionMuts_UV2


Fig 6D

##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")

###Calculate clonality for each gene/mutation per patient...
library(nrstats)#install.packages("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/bms038_analysis-master/packages/nrstats_0.1.0.tar.gz",repos = NULL, type="source")
library(ggplot2)
library(grid)
library(sqldf)

####Get clonality of each mutation by reading pyclone output..

readPyCloneDat <- function(dirLoc) {
  fls <- dir(dirLoc, "*ci95.tsv");
  #print(sprintf("here: reading %s", fls))
  ids <- c();
  median_ccfs <- c();
  muts <- c();
  clusters <- c();
  clusters_2muts <- c();
  thresh95 <- c()
  thresh90 <- c()
  thresh85 <- c()

  for (f in fls) {
    # Read in Cellular fraciton data for sample and determine median CCF for patient 
    print(f)
    f2<-paste(dirLoc,f,sep="")
    dat<-read.csv(f2, stringsAsFactors=FALSE, sep="\t")
    
    mc <- median(dat$cellular_prevalence);
    median_ccfs <- c(mc, median_ccfs);
    
    # Parse out patient id from file name
    id <- strsplit(f, "\\.")[[1]][1]
    ids <- c(id, ids)
    
    # Number of clusters (raw)
    num_clus <- length(unique(dat$cluster_id))
    clusters <- c(num_clus, clusters);
    
    
    # Number of clusters with at least 2 mutations
    tmp<-sqldf("select COUNT(*) as res from dat group by cluster_id")
    num_clus2 <- sum(tmp > 1);
    clusters_2muts <- c(num_clus2, clusters_2muts);
    
    # Fraction Clonal/Subclonal
    n <- dim(dat)[1]
    muts <- c(n, muts)
    thresh95 <- c(sum(dat$CI95 >= 0.95)/n, thresh95)
    thresh90 <- c(sum(dat$CI95 >= 0.90)/n, thresh90)
    thresh85 <- c(sum(dat$CI95 >= 0.85)/n, thresh85)
    
  }
  
  ccfdat <- data.frame(ids=ids, ccf=median_ccfs, clusters=clusters, clusters_2muts = clusters_2muts,
                       thresh95=thresh95, thresh90=thresh90, 
                       thresh85=thresh85, pyclone_muts=muts, stringsAsFactors=FALSE);
  rownames(ccfdat)<-ccfdat$ids
  return(ccfdat);
} ##This is to extract clonal mutational load

######################################################

# Create clonal mutation load data from pyclone output
indir <- paste("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/pyclone/pre/")
##Import CN data...it was first merge by sample...
setwd(indir)

dat <- readPyCloneDat(indir)
dat$Tumor_Sample_Barcode<-gsub(pattern = "_pre","",dat$ids)

Riaz_Clinical3 <- merge(Riaz_Clinical2, dat[,c("Tumor_Sample_Barcode","ccf","thresh95","pyclone_muts")], by="Tumor_Sample_Barcode")
##Not all samples have clonality data avaliable...8 patients lost... total = 57

Riaz_Clinical3$thresh95.muts <- log10(Riaz_Clinical3$`Mutation Load`) * Riaz_Clinical3$thresh95
# Check clonal mutation load and survival (cut point at median)
# Overall Survival by Clonal Mutation load in Ipi-Naive Patients (Figure 1B of original article)
Riaz_Clinical3$clon_cut[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$clon_cut[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 

# Create clonal mutation load data from pyclone output
indir <- paste("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/pyclone/pre/")
##Import CN data...it was first merge by sample...
setwd(indir)

dat <- readPyCloneDat(indir)
dat$Tumor_Sample_Barcode<-gsub(pattern = "_pre","",dat$ids)

Riaz_Clinical3 <- merge(Riaz_Clinical2, dat[,c("Tumor_Sample_Barcode","ccf","thresh95","pyclone_muts")], by="Tumor_Sample_Barcode")
##Not all samples have clonality data avaliable...8 patients lost... total = 57

Riaz_Clinical3$thresh95.muts <- log10(Riaz_Clinical3$`Mutation Load`) * Riaz_Clinical3$thresh95
# Check clonal mutation load and survival (cut point at median)
# Overall Survival by Clonal Mutation load in Ipi-Naive Patients (Figure 1B of original article)
Riaz_Clinical3$clon_cut[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$clon_cut[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$thresh95.muts[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 

fit <- survfit(Surv(OS, OS_status) ~ clon_cut, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",])

ggsurvplot(
  fit,
  data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-NAIVE OS (Riaz study) - Clonal mutation load ",legend.title = "Clonal mutation load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
    # Change ggplot2 theme
) ##Sifnificant...for naive 0.00039 but not for Prog...

summary(coxph(Surv(OS, OS_status) ~  clon_cut, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))



###This is to extract which mutations are clonal or subclonal
indir<-"/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/pyclone/pre"
list_of_files <- list.files(path = indir, recursive = TRUE, pattern = "*ci95.tsv", full.names = TRUE)

pyclon<-rbindlist(sapply(list_of_files, fread, simplify = FALSE),            use.names = TRUE, idcol = "FileName")
##Clean sample name
pyclon<-as.data.frame(pyclon)
pyclon$Clonality<-ifelse(pyclon$CI95<0.95,"Subclonal","Clonal")
Riaz_MAF$mutation_id<-paste0(Riaz_MAF$Tumor_Sample_Barcode,"_pre:",Riaz_MAF$Hugo_Symbol,":chr",Riaz_MAF$Chromosome,"_",Riaz_MAF$Start_Position,"_")  #Do this to match and assign clonality to maf (some 8 samples are missing and hence some mutations 8233...)
colnames(pyclon)[2]<-"Mutation_id_number"
pyclon$mutation_id<-gsub('.{3}$',"",pyclon$mutation_id)


Riaz_MAF$Clonality_pre<-pyclon$Clonality[match(Riaz_MAF$mutation_id,pyclon$mutation_id)]
Riaz_MAF$CCF_pre<-pyclon$cellular_prevalence[match(Riaz_MAF$mutation_id,pyclon$mutation_id)]
Riaz_MAF$VAF<-pyclon$variant_allele_frequency[match(Riaz_MAF$mutation_id,pyclon$mutation_id)]
 
  ###Get number of clonal AICDA mutations and try survival analysis..

Riaz_MAF$SBS.Sig.max_reduced2<-ifelse(Riaz_MAF$SBS.Sig.max_reduced%nin%c("UV","AICDA_canonical"),"Other",Riaz_MAF$SBS.Sig.max_reduced)


riaz_melt<-Riaz_MAF[] %>% group_by(Tumor_Sample_Barcode,SBS.Sig.max_reduced2,Clonality_pre) %>% dplyr::summarise(mut_sum = sum(mut_count))
riaz_melt<-as.data.frame(riaz_melt)
head(riaz_melt)
dim(riaz_melt)

###Add to the clinical data the fr
riaz_melt2<-riaz_melt[riaz_melt$SBS.Sig.max_reduced2=="AICDA_canonical"&riaz_melt$Clonality_pre=="Clonal",]
Riaz_Clinical3$AICDA_clonalMuts<-riaz_melt2$mut_sum[match(Riaz_Clinical3$Tumor_Sample_Barcode,riaz_melt2$Tumor_Sample_Barcode)]
riaz_melt2<-riaz_melt[riaz_melt$SBS.Sig.max_reduced2=="UV"&riaz_melt$Clonality_pre=="Clonal",]
Riaz_Clinical3$UV_clonalMuts<-riaz_melt2$mut_sum[match(Riaz_Clinical3$Tumor_Sample_Barcode,riaz_melt2$Tumor_Sample_Barcode)]
riaz_melt2<-riaz_melt[riaz_melt$SBS.Sig.max_reduced2=="Other"&riaz_melt$Clonality_pre=="Clonal",]
Riaz_Clinical3$Other_clonalMuts<-riaz_melt2$mut_sum[match(Riaz_Clinical3$Tumor_Sample_Barcode,riaz_melt2$Tumor_Sample_Barcode)]

Riaz_Clinical3$AICDA_clonalMuts[is.na(Riaz_Clinical3$AICDA_clonalMuts)]<-0;Riaz_Clinical3$UV_clonalMuts[is.na(Riaz_Clinical3$UV_clonalMuts)]<-0;Riaz_Clinical3$Other_clonalMuts[is.na(Riaz_Clinical3$Other_clonalMuts)]<-0 #fill with O if NA
###New boxplot...


t<-melt(Riaz_Clinical3,id.vars=c('Response_group',"Cohort"), measure.vars=c('AICDA_clonalMuts','UV_clonalMuts',"Other_clonalMuts")) #Do not consider NE samples (Not evaluable)

#Add line global median, add n samples to labels...

colors_riaz<-c("limegreen","orange2","blueviolet")
  ggboxplot(data = t[t$Cohort=="NIV3-NAIVE",], y ="value", x="variable",color  = "Response_group",xlab = "Signature",ylab = "Number of clonal mutations",title = "Riaz study (n = 30; Ipi-Naive)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+guides(color=guide_legend(title = "Clinical group"))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)
  
 ggboxplot(data = t[t$Cohort=="NIV3-PROG",], y ="value", x="variable",color  = "Response_group",xlab = "Signature",ylab = "Number of clonal mutations",title = "Riaz study (n = 30; Ipi-PROG)")+theme_std()+theme(axis.title = element_text(face="bold"),plot.title = element_text(hjust = 0.5,face = "bold",size = 15))+guides(color=guide_legend(title = "Clinical group"))+stat_compare_means(method = "wilcox.test",hide.ns = T,label = "p.signif",aes(group=Response_group))+scale_fill_manual(values=colors_riaz)
  
###SUrvival coxph
 summary(coxph(Surv(OS, OS_status) ~  clon_cut, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))
#Significant, previous
Riaz_Clinical3$CLonal_load<-rowSums(Riaz_Clinical3$AICDA_clonalMuts,Riaz_Clinical3$UV_clonalMuts,Riaz_Clinical3$Other_clonalMuts) 
 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$CLonal_load[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$CLonal_load[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$CLonal_load[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$CLonal_load[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 

 summary(coxph(Surv(OS, OS_status) ~  Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))
###Same as the reported in the original article...
 
############
 ###Check with AICDA
 Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$AICDA_clonalMuts[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$AICDA_clonalMuts[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$AICDA_clonalMuts[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$AICDA_clonalMuts[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 

 summary(coxph(Surv(OS, OS_status) ~  Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))
 ###Significant....p=0.0174
 
#####Now for the neoepitopes
 #Import data

##Neoepitopes only contain the gene producing and the sequence but not the genomic position of the original mutation, recover these data by extracting the amminoacid change (check which letter changes from the wt seq to the mut seq).
library(readxl)
Riaz2017_Neoepitopes <- read_excel("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz2017_Neoepitopes.xlsx", 
    skip = 3)
Riaz2017_Neoepitopes<-as.data.frame(Riaz2017_Neoepitopes)

##
Riaz2017_Neoepitopes$WT_AA<-apply(Riaz2017_Neoepitopes,1,function(x){
  pos<-which.min(strsplit(x["WT Peptide"], "")[[1]] == strsplit(x["MT Peptide"], "")[[1]]) #Gets the position
  WT_AA<-strsplit(x["WT Peptide"], "")[[1]][pos]
  #MUT_AA<-strsplit(x["MT Peptide"], "")[[1]][pos]
  return(WT_AA)
})#WT
Riaz2017_Neoepitopes$MUT_AA<-apply(Riaz2017_Neoepitopes,1,function(x){
  pos<-which.min(strsplit(x["WT Peptide"], "")[[1]] == strsplit(x["MT Peptide"], "")[[1]]) #Gets the position
  #WT_AA<-strsplit(x["WT Peptide"], "")[[1]][pos]
  MUT_AA<-strsplit(x["MT Peptide"], "")[[1]][pos]
  return(MUT_AA)
})#MUT


##Now get the amminoacid change from the maf_file with mutations
##change the HGVS_p to HGVSp_Short
library("seqinr")
Riaz_MAF$WT_AA<-substr(gsub(pattern = ".*[.]",replacement = "",x = Riaz_MAF$HGVS_p),start = 1,stop = 3)
Riaz_MAF$MT_AA<-substr(gsub(pattern = ".*[.]",replacement = "",x = Riaz_MAF$HGVS_p),start = nchar(gsub(pattern = ".*[.]",replacement = "",x = Riaz_MAF$HGVS_p))-2,stop = nchar(gsub(pattern = ".*[.]",replacement = "",x = Riaz_MAF$HGVS_p)))
Riaz_MAF$WT_AA<-seqinr::a(Riaz_MAF$WT_AA) #Convert to one letter
Riaz_MAF$MT_AA<-seqinr::a(Riaz_MAF$MT_AA)
##Correlate this data to NeoAntigens to attribute each neoAg to a signature and also clonality...
Riaz2017_Neoepitopes$Mut_id_neo<-paste0(Riaz2017_Neoepitopes$Patient,"_",Riaz2017_Neoepitopes$`Hugo Symbol`,"_",Riaz2017_Neoepitopes$WT_AA,"_",Riaz2017_Neoepitopes$MUT_AA)
Riaz_MAF$Mut_id_neo<-paste0(Riaz_MAF$Tumor_Sample_Barcode,"_",Riaz_MAF$Hugo_Symbol,"_",Riaz_MAF$WT_AA,"_",Riaz_MAF$MT_AA)

#Add clonality
Riaz2017_Neoepitopes$Clonality_pre<-Riaz_MAF$Clonality_pre[match(Riaz2017_Neoepitopes$Mut_id_neo,Riaz_MAF$Mut_id_neo)]
Riaz2017_Neoepitopes$SBS.Sig.max_reduced2<-Riaz_MAF$SBS.Sig.max_reduced2[match(Riaz2017_Neoepitopes$Mut_id_neo,Riaz_MAF$Mut_id_neo)]
#####
######
####Now use prime to calculate immunogenecity of each neoags
###Export table with peptide list and calculate Prime values
Riaz2017_Neoepitopes$`MT Allele`[Riaz2017_Neoepitopes$`MT Allele`==""]<-NA #Replace
peptide_list<-Riaz2017_Neoepitopes$`MT Peptide`[!is.na(Riaz2017_Neoepitopes$`MT Allele`)]

#peptide_list<-peptide_list[-grep("X",peptide_list)]#Take out peptides containing "X" aminoacids
#peptide_list<-peptide_list[-grep("0",peptide_list)]
write.table(quote = F,sep = "\t",row.names = F,col.names = F,x = peptide_list,file="/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz2017_preMutpeptideList.txt")
##HLA names
HLA_list<-unique((Riaz2017_Neoepitopes$`MT Allele`))
HLA_list<-HLA_list[!is.na(HLA_list)]
HLA_list<-paste(as.character(HLA_list), collapse=",")
write.table(quote = F,sep = "\t",row.names = F,col.names = F,x = HLA_list,file="/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz_HLA_list.tab")

##
#
####In bash:
cd /home/user/PRIME-master/
./PRIME -i /media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz2017_preMutpeptideList.txt -o /media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz_mutPeptideSeq_list_out.txt -a C0501,A0201,B1801,A3002,B1402,B2705,A6802,C0802,C0702,A0301,C0602,C0701,A3001,B4001,A1101,A6801,B4402,A2501,B0801,A2902,B0702,A0101,C0303,B4403,A3101,B1501,B5101,C0401,B3501,A2402,A2301,C1203,B3901,A3201,B3503,B5701,B4501,C1402,A3301,A2601,B3801,A0205,A6601 -mix /home/user/MixMHCpred-master/MixMHCpred
######
####
##Import data and attribute each prime value to the corresponding HLA haplotype per patient per neoag
###Import PRIME results...

Riaz_PRIME_out <- read.delim("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Riaz_mutPeptideSeq_list_out.txt", comment.char="#")

##Now is the same length as the PRIME output and in the same order
all(Riaz_PRIME_out$Peptide==Riaz2017_Neoepitopes$`MT Peptide`) ###Sanity check, must be TRUE



#######Assign correct PRIME rank and score to each peptide depending on the patients real HLA
prime_cols<-gsub(pattern = "[.]",replacement = ":",gsub(pattern = "X.Rank_",replacement = "",colnames(Riaz_PRIME_out)[]))
Riaz2017_Neoepitopes$Best_allele<-NA
Riaz2017_Neoepitopes$Prime_Rank<-100
Riaz2017_Neoepitopes$Prime_score<-NA
length(unique(Riaz2017_Neoepitopes$Patient[!is.na(Riaz2017_Neoepitopes$`MT Allele`)]))

for (i in 1:nrow(Riaz2017_Neoepitopes)) {
  tmp_test<-Riaz2017_Neoepitopes$`MT Allele`[i] #
  rank_values<- Riaz_PRIME_out[i,c(which(prime_cols%in%tmp_test),1)] ##Extract values, ##Add "Peptide" column In case there was only one Haplotype
  
  rank_values<-rank_values[1]
  rank_values<-ifelse(!is.null(colnames(rank_values)),gsub(pattern = "[.]",replacement = ":",gsub(pattern = "X.Rank_",replacement = "",colnames(rank_values)[])),"Not_avaliable") #Extract Haplotype
  Riaz2017_Neoepitopes$Best_allele[i]<-rank_values
  Riaz2017_Neoepitopes$Prime_Rank[i]<-ifelse(rank_values!="Not_avaliable",Riaz_PRIME_out[i,which(prime_cols%in%rank_values)],"Not_avaliable")#Extract prime_rank of corresponding matching allele
  Riaz2017_Neoepitopes$Prime_score[i]<-ifelse(rank_values!="Not_avaliable",Riaz_PRIME_out[i,(which(prime_cols%in%rank_values)+1)],"Not_avaliable")#Extract prime_score of corresponding matching allele
  
}


# Patients with atleast one peptide encompassing the mutated residue showing a PRIME %rank score lower or equal to 0.5% with at least one of the patient’s HLA-I alleles were assigned to the group where the mutation would be immunogenic (P+ in Figure 6A). Patients with all peptides showing a PRIME %rank larger or equal to 2% with all HLA-I alleles of the patient were assigned to the group where the mutation would not be immunogenic (P). 
Riaz2017_Neoepitopes$Prime_Rank<-as.numeric(Riaz2017_Neoepitopes$Prime_Rank)
Riaz2017_Neoepitopes$Prime_score<-as.numeric(Riaz2017_Neoepitopes$Prime_score)


###Tag samples with altered antigen-presentation and add to clinical data..
###Get samples

disrupt_samples_riaz<-unique(Riaz_MAF$Tumor_Sample_Barcode[Riaz_MAF$Hugo_Symbol%in%c("HLA-A","HLA-B","HLA-C","CIITA","IRF1","PSME1","PSME2","PSME3","ERAP1","ERAP2","HSPA","HSPC","TAP1","TAP2","TAPBP","CALR","CNX","PDIA3","B2M")])
length(disrupt_samples_riaz)
Riaz_Clinical2$Antigen_disrupted<-ifelse(Riaz_Clinical2$Tumor_Sample_Barcode%in%disrupt_samples_riaz,"YES","NO")
Riaz_Clinical3$Antigen_disrupted<-ifelse(Riaz_Clinical3$Tumor_Sample_Barcode%in%disrupt_samples_riaz,"YES","NO")

Riaz2017_Neoepitopes$Immunogenic_Prime<-ifelse(Riaz2017_Neoepitopes$Prime_Rank<=0.5,"Immunogenic","Non-Immunogenic")


table(Riaz2017_Neoepitopes$Immunogenic_Prime)/nrow(Riaz2017_Neoepitopes)
#33.23% Immunogenic which is higher than the number detected in the TCGA..


Riaz2017_Neoepitopes$Ag_count<-1 #for counting

##Add total clonal (Immunogenic or not)

ag_melt2<-Riaz2017_Neoepitopes[] %>% group_by(Patient,Immunogenic_Prime,Clonality_pre) %>% dplyr::summarise(Ag_sum = sum(Ag_count))
ag_melt2<-as.data.frame(ag_melt2)
###Add numbers to clinical data...
ag_melt_tmp<-ag_melt2[ag_melt2$Immunogenic_Prime=="Immunogenic"&ag_melt2$Clonality_pre=="Clonal",]
#CIN=Clonal immunogenic neopeptides
Riaz_Clinical3$CIN<-ag_melt_tmp$Ag_sum[match(Riaz_Clinical3$Tumor_Sample_Barcode,ag_melt_tmp$Patient)]
Riaz_Clinical3$CIN[is.na(Riaz_Clinical3$CIN)]<-0
ag_melt_tmp<-ag_melt2[ag_melt2$Immunogenic_Prime=="Non-Immunogenic"&ag_melt2$Clonality_pre=="Clonal",]
#CINN clonal nonImmunogenic neopeptides
Riaz_Clinical3$CINN<-ag_melt_tmp$Ag_sum[match(Riaz_Clinical3$Tumor_Sample_Barcode,ag_melt_tmp$Patient)]
Riaz_Clinical3$CINN[is.na(Riaz_Clinical3$CINN)]<-0

Riaz_Clinical3$Clonal_neopeptides<-Riaz_Clinical3$CINN+Riaz_Clinical3$CIN

##Quick survival
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 

summary(coxph(Surv(OS, OS_status) ~  Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))
##0.0079* Using all the clonal neopeptides...

##Use only CIN
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
> Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 
 
summary(coxph(Surv(OS, OS_status) ~  Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))
#P val = 0.00538


##Add total clonal (Immunogenic or not) per signature (AICDA or UV)
ag_melt2<-Riaz2017_Neoepitopes[] %>% group_by(Patient,SBS.Sig.max_reduced2,Immunogenic_Prime,Clonality_pre) %>% dplyr::summarise(Ag_sum = sum(Ag_count))
ag_melt2<-as.data.frame(ag_melt2)
###Add numbers to clinical data...
ag_melt_tmp<-ag_melt2[ag_melt2$Immunogenic_Prime=="Immunogenic"&ag_melt2$Clonality_pre=="Clonal"&ag_melt2$SBS.Sig.max_reduced2=="AICDA_canonical",]
#CIN=Clonal immunogenic neopeptides ofr AID
Riaz_Clinical3$CIN_AID<-ag_melt_tmp$Ag_sum[match(Riaz_Clinical3$Tumor_Sample_Barcode,ag_melt_tmp$Patient)]
Riaz_Clinical3$CIN_AID[is.na(Riaz_Clinical3$CIN_AID)]<-0
##For UV
ag_melt_tmp<-ag_melt2[ag_melt2$Immunogenic_Prime=="Immunogenic"&ag_melt2$Clonality_pre=="Clonal"&ag_melt2$SBS.Sig.max_reduced2=="UV",]
#CIN=Clonal immunogenic neopeptides
Riaz_Clinical3$CIN_UV<-ag_melt_tmp$Ag_sum[match(Riaz_Clinical3$Tumor_Sample_Barcode,ag_melt_tmp$Patient)]
Riaz_Clinical3$CIN_UV[is.na(Riaz_Clinical3$CIN_UV)]<-0

##Quick survival

##Usins CIN_AID
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$CIN_AID[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$CIN_AID[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$CIN_AID[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$CIN_AID[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 

summary(coxph(Surv(OS, OS_status) ~  Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))
##0.0387* Using all the CIN_AID

##Use only CIN UV
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$CIN_UV[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$CIN_UV[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$CIN_UV[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$CIN_UV[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 
 
summary(coxph(Surv(OS, OS_status) ~  Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",]))
#P val = 0.0607


########KM plots...c
##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")

#Import Data

Riaz_Clinical3<-as.data.frame(read.delim("Data/Figure6/Figure6D.tab",check.names = T))

Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$Clonal_neopeptides[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 


fit <- survfit(Surv(OS, OS_status) ~ Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",])

psurv1<-ggsurvplot(
  fit,
  data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - Clonal neopeptides load ",legend.title = "CN load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
  # Change ggplot2 theme
)
# Arrange and save into pdf file


##Use only CIN
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] <- ifelse(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-NAIVE"] >= median(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-NAIVE"]), "High","Low") 
Riaz_Clinical3$Clon_cut_tmp[Riaz_Clinical3$Cohort=="NIV3-PROG"] <- ifelse(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-PROG"] >= median(Riaz_Clinical3$CIN[Riaz_Clinical3$Cohort=="NIV3-PROG"]), "High","Low") 
 

fit <- survfit(Surv(OS, OS_status) ~ Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",])

psurv2<-ggsurvplot(
  fit,
  data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - Immunogenic clonal neopeptides load ",legend.title = "ICN load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
  # Change ggplot2 theme
)

xbp_grob2 <-ggarrange(psurv1$plot,NULL,ggarrange(NULL,psurv2$plot,NULL,nrow=3,heights = c(.1,.8,.1)),align = "hv",ncol = 3,widths  = c(2,.5,2))


###
##AICDA ICN
##Usins CIN_AID
Riaz_Clinical3$CIN_AID_binary <- ifelse(Riaz_Clinical3$CIN_AID[] >= median(Riaz_Clinical3$CIN_AID[]), "High","Low") 
##Globally


fit <- survfit(Surv(OS, OS_status) ~ CIN_AID_binary, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",])

psurv3<-ggsurvplot(
  fit,
  data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - ICN AICDA load ",legend.title = "ICN AICDA load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
  # Change ggplot2 theme
)




##Use only CIN UV

Riaz_Clinical3$Clon_cut_tmp <- ifelse(Riaz_Clinical3$CIN_UV[] >= median(Riaz_Clinical3$CIN_UV[]), "High","Low") 


fit <- survfit(Surv(OS, OS_status) ~ Clon_cut_tmp, data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",])

psurv4<-ggsurvplot(
  fit,
  data = Riaz_Clinical3[Riaz_Clinical3$Cohort=="NIV3-NAIVE",],
  size = 1,                 # change line size
  title = "Ipi-Naive OS (Riaz study) - ICN UV load ",legend.title = "ICN UV load",xlab="Time in years" ,surv.median.line = "hv",
  #palette =
  #  c("#E7B800", "#2E9FDF"),# custom color palettes
  #conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  legend.labs =
    c("High", "Low"),    # Change legend labels
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_std()+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))
  # Change ggplot2 theme
)


xbp_grob3 <-ggarrange(psurv3$plot,NULL,psurv4$plot,align = "hv",nrow = 3,heights   = c(1,.2,1))



###Plots

xbp_grob2

########

xbp_grob3


Fig 6E

##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")
#############Import RNA seq Data...to check the expression of some immune checkpoin molecules between the groups...in Ipi-Naive
RNA_counts_Riaz <- read_table2("CountData.BMS038.txt")
RNA_counts_Riaz<-as.data.frame(RNA_counts_Riaz)

###Normalize with Deseq2
library("DESeq2")
library("org.Hs.eg.db")
library("genefilter")
library("Biobase")
# Gene Filter function
filterthegenes <- function(DeseqObjevt){
    chch=DeseqObjevt
    chch <- chch[, !(is.na(chch$Response)) ]
    colData(chch)=droplevels(colData(chch))
    chch <- estimateSizeFactors(chch)

    #Filter expression measure above A in at least k samples
    #Remove low read counts (gene must have 4 or more samples with more or equal to 2 read count)
    mat=assay(chch)
    f1 <- kOverA(k = 4, A = 2,na.rm = T)
    ffun <- filterfun(f1)
    wh1 <- genefilter(mat, ffun)
    byKoA=which(wh1 == "FALSE")
    byKoA=names(byKoA)

    if(length(byKoA) == 0){ mock = chch}
    if(length(byKoA) > 0){ mock = chch[-(which(row.names(chch) %in% byKoA)), ]}
    # Filter based on delta between last 1/3 and first 1/2 samples
    # the gene with (variances of delta) < 1 got filtered
    mock=estimateSizeFactors(mock)
    normcounts=assay(mock,normalized=T)
    dlta=normcounts[,(ncol(mock)/2+1):(ncol(mock))]-normcounts[,1:(ncol(mock)/2)]
    vars=log10(rowVars(dlta))
    names(vars)=row.names(dlta)
    #hist(vars,100)
    byVar=unique(names(which(vars < 0)))

    # Filter based on read count variances
    # the gene with (variances of expression) < 1 or standard deviation < 1 got filtered
    mock <- chch[-(which(row.names(chch) %in% c(byKoA,byVar))), ]
    mock=estimateSizeFactors(mock)
    mat=assay(mock,normalized=T)
    vars=rowVars(mat)
    names(vars)=row.names(mat)
    madds=rowMads(mat)
    names(madds)=row.names(mat)
    #hist(log10(vars),100)
    #hist(log10(madds),100,main="Histogram of mads")
    byVar2=c(names(which(log10(vars) < 0)),names(which(log10(madds) <0)))

    myfilter=unique(c(byKoA,byVar,byVar2))
    return(myfilter)
}

# Gene symbol search function
ENTREZtoSYMBOL <- function(x){
  vec=mapIds(org.Hs.eg.db,keys=x,column="SYMBOL",keytype="ENTREZID",multiVals="first")
  repl=which(is.na(vec))
  for(y in repl){vec[y]=x[y]}
  return(vec)
}

# Result filter function: no NA & FDR < 0.2
ResFiltered <- function(x){
  temp=na.omit(x)
  temp=temp[which(temp$padj < 0.2),]
  temp$symbol=ENTREZtoSYMBOL(row.names(temp))
  return(temp)
}

##########################################################
# Build object and run DEG
##########################################################
# Read in matrix and Sample annotation
mat <- read.delim("CountData.BMS038.txt")

SampleTableCorrected <- read.csv("/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/bms038_analysis-master/data/SampleTableCorrected.9.19.16.csv", row.names=1)

# Only samples with response
SampleTableCorrected <- SampleTableCorrected[!(is.na(SampleTableCorrected$Response)), ]

# Find the overlapping samples
inter <- intersect(colnames(mat),rownames(SampleTableCorrected))
SampleTableCorrected <- SampleTableCorrected[inter,]
####Add the variable of ICN AID
SampleTableCorrected$CIN_AID_binary<-Riaz_Clinical3$CIN_AID_binary[match(SampleTableCorrected$PatientID,Riaz_Clinical3$Tumor_Sample_Barcode)]
  ##Subset to samples having the CIN value
SampleTableCorrected<-SampleTableCorrected[!is.na(SampleTableCorrected$CIN_AID_binary),]
SampleTableCorrected<-SampleTableCorrected[SampleTableCorrected$Cohort=="NIV3-NAIVE",]##Subset to only Naive samples since the effect was seen in those only...

mat <- mat[,match(rownames(SampleTableCorrected),colnames(mat))]

# Create the DESeq2 object
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
ebg <- exonsBy(txdb, by="gene")
intersection <- intersect(rownames(mat),ebg@partitioning@NAMES)
ebg2 <- ebg[ebg@partitioning@NAMES %in% intersection]

# sort by ID
ebg2 <- ebg2[order(names(ebg2))]

# Sort by gene model order
mat <- mat[match(names(ebg2), rownames(mat)),]

# Create object
ddsMat <- DESeqDataSetFromMatrix(countData = mat,
                                  colData = SampleTableCorrected,
                                  design = ~ 1)
# Assign genomic range
rowRanges(ddsMat) <- ebg2

# Get teh gene symbol
mcols(ddsMat)$symbols <-  mapIds(org.Hs.eg.db,
                                       keys=rownames(ddsMat),
                                       column="SYMBOL",
                                       keytype="ENTREZID",
                                       multiVals="first")

# No On samples
dds.pre <- ddsMat[,!(ddsMat$PreOn=="On")]

# Exclude 4 samples
(dds.pre <- dds.pre[,-which( as.character(dds.pre$Sample) %in% c("Pt48_Pre","Pt67_Pre","Pt52_Pre","Pt27_Pre"))])

# Reset the levels of colData factor variables
colData(dds.pre) <- DataFrame(droplevels(data.frame(colData(dds.pre))))

# Filter out genes
myfilter <- filterthegenes(dds.pre)
dds.pre <- dds.pre[ -which(rownames(dds.pre) %in% myfilter), ]

# Set the level of response using the CIN_AID_binary
dds.pre$CIN_AID_binary <- factor(dds.pre$CIN_AID_binary,levels=c("Low","High")) #High vs Low

# Set the DEG design
design(dds.pre) <- ~CIN_AID_binary

# Calculate size factor
dds.pre <- estimateSizeFactors(dds.pre)

# Run the DEG analysis
dds.pre <- DESeq(dds.pre)

# Get the result
Response.ihw.ICNAID <- ResFiltered(results(dds.pre,filterFun = ihw))
Response.ihw.ICNAID <- Response.ihw.ICNAID[order(Response.ihw.ICNAID$padj),]

#174 genes...

write.table(Response.ihw.ICNAID, 'Pre_ICNAID_highvsLow_DEG.csv', sep=",", quote=F, col.names=NA)

##Get the normalized counts for heatmaps...
vsd_dds.pre<- vst(dds.pre, blind=FALSE)

Pre_normalized<-assay(vsd_dds.pre)
Pre_normalized<-Pre_normalized[rownames(Pre_normalized)%in%c(rownames(Response.ihw.ICNAID),"5133"),] #Only DE genes and PD1
rownames(Pre_normalized)<-RNA_counts_Riaz$HUGO[match(rownames(Pre_normalized),RNA_counts_Riaz$Row)] #Change to Hugo SYmbol


#####Make a heatmap with these data...
##Transform expression to Z-score
###
heat_dat_exp<-t(Pre_normalized)#transpose data
heat_dat_exp<-normalizeQuantiles(heat_dat_exp)
#Log2 transformation
heat_dat_exp<-log2(heat_dat_exp)
#z-score normalization using the scale function
## Remember   z = ( x - μ ) / σ
heat_dat_exp<-scale(heat_dat_exp)#Transform data to Z-score
heat_dat_exp<-t(heat_dat_exp)
###Prepare annotation for heatmap
heat_dat<-SampleTableCorrected[SampleTableCorrected$PreOn=="Pre",]
heat_dat<-heat_dat[intersect(colnames(heat_dat_exp),heat_dat$Sample),] #Same samples
heat_dat$OS<-Riaz_Clinical3$OS[match(heat_dat$PatientID,Riaz_Clinical3$Tumor_Sample_Barcode)]#Add survival
##Reorder
heat_dat<-heat_dat %>% arrange(desc(CIN_AID_binary),OS, (PatientID))

##Reorder exp data
heat_dat_exp<-heat_dat_exp[,heat_dat$Sample]

col_fun<-colorRamp2(c(0, 1.6, 3.2), c("white", "cadetblue2", "dodgerblue2"))

library(ComplexHeatmap)
ha3 <- HeatmapAnnotation(Survival=heat_dat$OS[],ICN_AID=heat_dat$CIN_AID_binary[],Response=heat_dat$Response[],col = list(Survival=col_fun,ICN_AID=c("High" = "darkred", "Low" = "seagreen"),Response= c("PRCR" = "limegreen", "PD" = "orange2","SD"="blueviolet")),
                         annotation_legend_param = list(Survival=list(title = "Survival years",at=c(0,1.6,3.2),labels=c("0","1.6","3.2")),ICN_AID = list(
                           title = "ICN AICDA load",
                           at = c("High", "Low"),
                           labels = c("High", "Low")
                         ),Response = list(
                           title = "Response",
                           at = c("PRCR", "PD","SD"),
                           labels = c("CR/PR", "PD","SD")
                         )))
  

  

col_fun2 = colorRamp2(c(-4, 0, 4), c("#440154FF","#21908CFF","yellow"))
#col_fun<-colorRamp2(c(0, 6, 12), c("white", alpha(colour = "dodgerblue2",alpha = 0.5), "dodgerblue2"))

pre_heatmap<-  Heatmap(heat_dat_exp,show_row_names = F ,name = "Z-score",column_title = paste0("Pre Ipi-Naive (n = ", ncol(heat_dat_exp),")"),show_column_dend = FALSE,show_row_dend =T, cluster_columns = FALSE,cluster_rows = T,top_annotation = ha3,column_title_gp = gpar(fontsize = 18, fontface = "bold",col="black"),column_names_gp = gpar(fontsize=8),
              #column_split = data.frame(c(rep("Both",ncol(heat_dat_exp)/2),rep("Transcript",ncol(heat_dat_exp)/2) )) ,
              cluster_row_slices = FALSE,  cluster_column_slices = FALSE,row_gap = unit(c( 2), "mm"),border = TRUE,show_column_names = T,row_split = 2,left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:3),labels = c("Cluster 1", "Cluster 2"), labels_gp = gpar(col = "white", fontsize = 10))),heatmap_width = unit(10, "cm"), heatmap_height = unit(10, "cm"),col=col_fun2,row_title = "")





###Pathway analysis separating upregulated from downregulated
#####Extract cluster 1 = Upregulated  and Cluster2 (downRegulated)
pre_up<-rownames(heat_dat_exp)[row_order(pre_heatmap)[[1]]] #62 up
pre_down<-rownames(heat_dat_exp)[row_order(pre_heatmap)[[2]]] #112 down

write.table(unique(pre_up),file = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/genes_AICDA_preUp.tab",row.names = F,quote = F,sep = "\t",col.names = F)

write.table(unique(pre_down),file = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/genes_AICDA_preDown.tab",row.names = F,quote = F,sep = "\t",col.names = F)

##############""
########ClusterProfiler Analysis
library(msigdbr)
library(clusterProfiler)
library(DOSE)
#Prepare genelist object (gene IDs must be in entrez gene id type)
for_cluster<- data.frame(ID= rownames(Response.ihw.ICNAID), log2FoldChange=Response.ihw.ICNAID$log2FoldChange)

geneList <- for_cluster[,2]
names(geneList) <- as.character(for_cluster[,1])
geneList <- sort(geneList, decreasing = TRUE)

gene <- names(geneList)[abs(geneList) > 0]
head(gene)
length(gene)
#174
#Get enrichment from different databases and plot
library(enrichplot)

edoGO<-enrichGO(gene, 'org.Hs.eg.db', ont="ALL",pvalueCutoff = .2,qvalueCutoff = .2,minGSSize = 4) #GO dataset for all subontologies (chose "BP" for simplify function)
barplot(edoGO, showCategory=30) #No enrichment found
edoKEGG<-enrichKEGG(gene,pvalueCutoff = .2,qvalueCutoff = .2) #KEGG dataset
barplot(edoKEGG, showCategory=30)

kk2 <- gseKEGG(geneList     = geneList,
               organism     = 'hsa',
               nPerm        = 1000,
               minGSSize    = 5,
               pvalueCutoff = .2,qvalueCutoff = .2,
               verbose      = FALSE)
####No enrichment...
m_t2g <- msigdbr(species = "Homo sapiens", category = "C7") %>% 
  dplyr::select(gs_name, entrez_gene)
head(m_t2g) # oncogenic signatures
em <- enricher(gene, TERM2GENE=m_t2g,minGSSize = 5,pvalueCutoff = .2)
em2 <- GSEA(geneList, TERM2GENE = m_t2g,minGSSize = 5,pvalueCutoff = .2)
head(em2)
em2 <- setReadable(em2, 'org.Hs.eg.db', 'ENTREZID')
cnetplot(em2, categorySize="pvalue",foldChange=geneList)


####Low numbers of genes for enrichment...try ingenuity pathway analysis..

###Manually cite within the paper important genes

#Re do the heatmap showing the labels of those important genes
labels = data.frame(labels=rownames(heat_dat_exp),colors=rownames(heat_dat_exp))
labels$colors<-"black"
# Set the rest of genes to ""
genelist<-c("CD8B","HLA-DRB1","CREB1","TRAF2","LITAF","VCAM1","MYD88") #HSPA1B out to avoid not showing.., downgenes
genelist<-c(genelist,"IL1R1","NOTCH3") #Upgenes
labels$labels[!labels$labels %in% genelist] <- "" 
#labels$colors[labels$labels %in% c("CD8B","HLA-DRB1")] <- "blue" #Antigen presentation
#labels$colors[labels$labels %in% c("CREB1","TRAF2","LITAF","VCAM1")] <- "lightcoral" #TNF signaling



##This is Supplementary Figure 23B

pre_heatmap<- Heatmap(heat_dat_exp,show_row_names = T ,name = "Z-score",column_title = paste0("Pre Ipi-Naive (n = ", ncol(heat_dat_exp),")"),show_column_dend = FALSE,show_row_dend =T, cluster_columns = FALSE,cluster_rows = T,top_annotation = ha3,column_title_gp = gpar(fontsize = 18, fontface = "bold",col="black"),column_names_gp = gpar(fontsize=8),
              #column_split = data.frame(c(rep("Both",ncol(heat_dat_exp)/2),rep("Transcript",ncol(heat_dat_exp)/2) )) ,
              cluster_row_slices = FALSE,  cluster_column_slices = FALSE,row_gap = unit(c( 2), "mm"),border = TRUE,show_column_names = T,row_split = 2,left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:3),labels = c("Cluster 1", "Cluster 2"), labels_gp = gpar(col = "white", fontsize = 10))),heatmap_width = unit(12, "cm"), heatmap_height = unit(12, "cm"),col=col_fun2,row_title = "",row_labels=labels$labels,row_names_gp = gpar(fontsize=8,col = labels$colors))


#########

###############
##########
############"
####Now compare between these to groups but also and the pre-on condition...
# Use ALL samples in analysis
dds.onpre <- ddsMat


dds.onpre$PreOn <- factor(dds.onpre$PreOn,levels= unique(dds.onpre$PreOn)) #High vs Low

# Set Pre as the reference level
dds.onpre$PreOn <- relevel(dds.onpre$PreOn,"Pre")

# Drop patient ID level which is not included
dds.onpre$PatientID <-  factor(dds.onpre$PatientID,levels=unique(dds.onpre$PatientID))
dds.onpre$PatientID <- droplevels(dds.onpre$PatientID)

# Set the level of response using the CIN_AID_binary
dds.onpre$CIN_AID_binary <- factor(dds.onpre$CIN_AID_binary,levels=c("Low","High")) #High vs Low


# Set the design with controling patient and AICDA
design(dds.onpre)<- ~  CIN_AID_binary+PreOn

# Calculate the sample size factor
dds.onpre <- estimateSizeFactors(dds.onpre)

# Run the DEG analysis
dds.onpre <- DESeq(dds.onpre)

# Get the result
ResFiltered2 <- function(x){
  temp=na.omit(x)
  temp=temp[which(temp$padj < 0.05),]
  temp$symbol=ENTREZtoSYMBOL(row.names(temp))
  return(temp)
}
Response.results.ONvsPRE <- ResFiltered(results(dds.onpre, contrast=c("CIN_AID_binary","High","Low")))
Response.results.ONvsPRE <- Response.results.ONvsPRE[order(Response.results.ONvsPRE$padj),]

write.table(Response.results.ONvsPRE, '/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/Pre_vs_ON_ICNAID_DEG.csv', sep=",", quote=F, col.names=NA)

##Get the normalized counts for heatmaps...
vsd_dds.preON<- vst(dds.onpre, blind=FALSE)

PreON_normalized<-assay(vsd_dds.preON)
PreON_normalized<-PreON_normalized[rownames(PreON_normalized)%in%c(rownames(Response.results.ONvsPRE)),] #Only DE genes and PD1
rownames(PreON_normalized)<-RNA_counts_Riaz$HUGO[match(rownames(PreON_normalized),RNA_counts_Riaz$Row)] #Change to Hugo SYmbol

############
##Transform expression to Z-score
###
heat_dat_exp<-t(PreON_normalized)#transpose data
heat_dat_exp<-normalizeQuantiles(heat_dat_exp)
#Log2 transformation
heat_dat_exp<-log2(heat_dat_exp)
#z-score normalization using the scale function
## Remember   z = ( x - μ ) / σ
heat_dat_exp<-scale(heat_dat_exp)#Transform data to Z-score
heat_dat_exp<-t(heat_dat_exp)
###Prepare annotation for heatmap
heat_dat<-SampleTableCorrected[,]

heat_dat$OS<-Riaz_Clinical3$OS[match(heat_dat$PatientID,Riaz_Clinical3$Tumor_Sample_Barcode)]#Add survival
##Reorder
heat_dat<-heat_dat %>% arrange(desc(CIN_AID_binary),OS, (PatientID))

##Reorder exp data
heat_dat_exp<-heat_dat_exp[,heat_dat$Sample]

heat_dat_exp[rownames(heat_dat_exp)=="CXCL13",]

col_fun<-colorRamp2(c(0, 1.6, 3.2), c("white", "cadetblue2", "dodgerblue2"))


library(ComplexHeatmap)
ha3 <- HeatmapAnnotation(Survival=heat_dat$OS[],ICN_AID=heat_dat$CIN_AID_binary[],PreOn=heat_dat$PreOn[],Response=heat_dat$Response[],col = list(PreOn= c("On" = "darkblue", "Pre" = "firebrick1"),Survival=col_fun,ICN_AID=c("High" = "darkred", "Low" = "seagreen"),Response= c("PRCR" = "limegreen", "PD" = "orange2","SD"="blueviolet")),
                         annotation_legend_param = list(PreOn = list(
                           title = "Therapy moment",
                           at = c("Pre", "On"),
                           labels = c("Before", "During")
                         ),Survival=list(title = "Survival years",at=c(0,1.6,3.2),labels=c("0","1.6","3.2")),ICN_AID = list(
                           title = "ICN AICDA load",
                           at = c("High", "Low"),
                           labels = c("High", "Low")
                         ),Response = list(
                           title = "Response",
                           at = c("PRCR", "PD","SD"),
                           labels = c("CR/PR", "PD","SD")
                         )))

col_fun2 = colorRamp2(c(-4, 0, 4), c("#440154FF","#21908CFF","yellow"))

PreOn_heatmap<-  Heatmap(heat_dat_exp,show_row_names = F ,name = "Z-score",column_title = paste0("Pre-On Ipi-Naive (n = ", ncol(heat_dat_exp),")"),show_column_dend = FALSE,show_row_dend =T, cluster_columns = FALSE,cluster_rows = T,top_annotation = ha3,column_title_gp = gpar(fontsize = 18, fontface = "bold",col="black"),column_names_gp = gpar(fontsize=8),
              #column_split = data.frame(c(rep("Both",ncol(heat_dat_exp)/2),rep("Transcript",ncol(heat_dat_exp)/2) )) ,
              cluster_row_slices = FALSE,  cluster_column_slices = FALSE,row_gap = unit(c( 2), "mm"),border = TRUE,show_column_names = T,row_split = 2,left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:3),labels = c("C1", "C2"), labels_gp = gpar(col = "white", fontsize = 10))),heatmap_width = unit(12, "cm"), heatmap_height = unit(12, "cm"),col=col_fun2,row_title = "")

###Pathway analysis separating upregulated from downregulated
#####Extract cluster 1 = Upregulated  and Cluster2 (downRegulated)
preON_up<-rownames(heat_dat_exp)[row_order(PreOn_heatmap)[[1]]] # up
preON_down<-rownames(heat_dat_exp)[row_order(PreOn_heatmap)[[2]]] # down

write.table(unique(preON_up),file = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/genes_AICDA_preUp.tab",row.names = F,quote = F,sep = "\t",col.names = F)

write.table(unique(preON_down),file = "/media/user/seagate_ICM/AICDA_analysis/NeoAgs/Riaz_2017/genes_AICDA_preDown.tab",row.names = F,quote = F,sep = "\t",col.names = F)



########ClusterProfiler Analysis
#Prepare genelist object (gene IDs must be in entrez gene id type)
for_cluster<- data.frame(ID= rownames(Response.results.ONvsPRE), log2FoldChange=Response.results.ONvsPRE$log2FoldChange)

geneList <- for_cluster[,2]
names(geneList) <- as.character(for_cluster[,1])
geneList <- sort(geneList, decreasing = TRUE)

gene <- names(geneList)[abs(geneList) > 0]
head(gene)
length(gene)
#811
#Get enrichment from different databases and plot


edoGO<-enrichGO(gene, 'org.Hs.eg.db', ont="BP",pvalueCutoff = .05,qvalueCutoff = .05) #GO dataset for all subontologies (chose "BP" for simplify function)
barplot(edoGO, showCategory=30) #No enrichment found
edoGOx <- setReadable(edoGO, 'org.Hs.eg.db', 'ENTREZID')  ##For the GO enrichment
cnetplot(edoGOx, foldChange=geneList, showCategory = 10)
edoGO_simplyfied <- simplify(edoGO, cutoff=0.7, by="p.adjust", select_fun=min)
edoGOx_simplyfied <- setReadable(edoGO_simplyfied, 'org.Hs.eg.db', 'ENTREZID')
cnetplot(edoGOx_simplyfied, foldChange=geneList, showCategory = 10)
##Still to many genes? Get a better view by: Enrichment map  which organizes enriched terms into a network with edges connecting overlapping gene sets. In this way, mutually overlapping gene sets are tend to cluster together, making it easy to identify functional module.
ego2 <- pairwise_termsim(edoGO, method="Wang")
    emapplot(ego2)

    
GO_allGenes<- dotplot(edoGOx_simplyfied, showCategory=30) + ggtitle("GSEA High ICN AID")


edoKEGG<-enrichKEGG(gene,pvalueCutoff = .2,qvalueCutoff = .2) #KEGG dataset
barplot(edoKEGG, showCategory=30)

kk2 <- gseKEGG(geneList     = geneList,
               organism     = 'hsa',
               minGSSize    = 5,
               pvalueCutoff = .2,
               verbose      = FALSE)
cnetplot(kk2, categorySize="pvalue",foldChange=geneList)

####No enrichment...



m_t2g <- msigdbr(species = "Homo sapiens", category = "C7") %>% 
  dplyr::select(gs_name, entrez_gene)
head(m_t2g) # oncogenic signatures
em <- enricher(gene, TERM2GENE=m_t2g,minGSSize = 5,pvalueCutoff = .05)

em <- setReadable(em, 'org.Hs.eg.db', 'ENTREZID')
dotplot(em, showCategory=30) + ggtitle("GSEA High ICN AID")
cnetplot(em, categorySize="pvalue",foldChange=geneList,showCategory = 10)



####Do separating by clusters...

preON_up<-Response.results.ONvsPRE[Response.results.ONvsPRE$symbol%in%preON_up,]
preON_down<-Response.results.ONvsPRE[Response.results.ONvsPRE$symbol%in%preON_down,]

compare_data<-list(Cluster_1=rownames(preON_up),Cluster_2=rownames(preON_down))

ck <- compareCluster(geneCluster = compare_data, fun = "enrichGO",ont="BP",OrgDb='org.Hs.eg.db')
head(as.data.frame(ck))

ck_simplyfied <- simplify(ck, cutoff=0.7, by="p.adjust", select_fun=min)


GESEA_p1<-dotplot(ck_simplyfied,showCategory=20)+ ggtitle("GSEA High ICN AID")+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))


ck_p<-pairwise_termsim(ck_simplyfied, method = "JC", semData = NULL, showCategory = 200)

ck_terms<-ck_simplyfied@compareClusterResult$Description[c(3,5,6,9,10,19,21,24,48,57,64,86,116,133)]


GESEA_p2<-emapplot(ck_p,showCategory = 30,color = "p.adjust")+ ggtitle("GSEA High ICN AID")+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))

###Manually cite within the paper important genes

#Re do the heatmap showing the labels of those important genes

labels = data.frame(labels=rownames(heat_dat_exp),colors=rownames(heat_dat_exp))
labels$colors<-"black"

# Set the rest of genes to ""
genelist<-c("CXCL13","CCR5","PRKCQ","STK24","ELK3") #.., upGenes#"ABCC6"
genelist<-c(genelist,"CD19","CXCL11","MAP3K13","MTOR") 


labels$labels[!labels$labels %in% genelist] <- "" 


##############
###########
##Heatmap

PreOn_heatmap<- Heatmap(heat_dat_exp,show_row_names = T ,name = "Z-score",column_title = paste0("Pre_On Ipi-Naive (n = ", ncol(heat_dat_exp),")"),show_column_dend = FALSE,show_row_dend =T, cluster_columns = FALSE,cluster_rows = T,top_annotation = ha3,column_title_gp = gpar(fontsize = 18, fontface = "bold",col="black"),column_names_gp = gpar(fontsize=8),
              #column_split = data.frame(c(rep("Both",ncol(heat_dat_exp)/2),rep("Transcript",ncol(heat_dat_exp)/2) )) ,
              cluster_row_slices = FALSE,  cluster_column_slices = FALSE,row_gap = unit(c( 2), "mm"),border = TRUE,show_column_names = T,row_split = 2,left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:3),labels = c("Cluster 1", "Cluster 2"), labels_gp = gpar(col = "white", fontsize = 10))),heatmap_width = unit(15, "cm"), heatmap_height = unit(15, "cm"),col=col_fun2,row_title = "",row_labels=labels$labels,row_names_gp = gpar(fontsize=8,col = labels$colors))

##
##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")
####Import Data

heat_dat_exp<-as.data.frame(read.delim("Data/Figure6/Figure6E_1A.tab"))
heat_dat_exp<-heat_dat_exp[!is.na(heat_dat_exp$Gene),]
rownames(heat_dat_exp)<-heat_dat_exp$Gene
heat_dat_exp<-heat_dat_exp[,-1]
heat_dat<-as.data.frame(read.delim("Data/Figure6/Figure6E_1B.tab"))


##############
###########
##Heatmap

col_fun<-colorRamp2(c(0, 1.6, 3.2), c("white", "cadetblue2", "dodgerblue2"))


ha3 <- HeatmapAnnotation(Survival=heat_dat$OS[],ICN_AID=heat_dat$CIN_AID_binary[],PreOn=heat_dat$PreOn[],Response=heat_dat$Response[],col = list(PreOn= c("On" = "darkblue", "Pre" = "firebrick1"),Survival=col_fun,ICN_AID=c("High" = "darkred", "Low" = "seagreen"),Response= c("PRCR" = "limegreen", "PD" = "orange2","SD"="blueviolet")),
                         annotation_legend_param = list(PreOn = list(
                           title = "Therapy moment",
                           at = c("Pre", "On"),
                           labels = c("Before", "During")
                         ),Survival=list(title = "Survival years",at=c(0,1.6,3.2),labels=c("0","1.6","3.2")),ICN_AID = list(
                           title = "ICN AICDA load",
                           at = c("High", "Low"),
                           labels = c("High", "Low")
                         ),Response = list(
                           title = "Response",
                           at = c("PRCR", "PD","SD"),
                           labels = c("CR/PR", "PD","SD")
                         )))

col_fun2 = colorRamp2(c(-4, 0, 4), c("#440154FF","#21908CFF","yellow"))


labels = data.frame(labels=rownames(heat_dat_exp),colors=rownames(heat_dat_exp))
labels$colors<-"black"

# Set the rest of genes to ""
genelist<-c("CXCL13","CCR5","PRKCQ","STK24","ELK3") #.., upGenes#"ABCC6"
genelist<-c(genelist,"CD19","CXCL11","MAP3K13","MTOR") 


labels$labels[!labels$labels %in% genelist] <- "" 



PreOn_heatmap<- Heatmap(heat_dat_exp,show_row_names = T ,name = "Z-score",column_title = paste0("Pre_On Ipi-Naive (n = ", ncol(heat_dat_exp),")"),show_column_dend = FALSE,show_row_dend =T, cluster_columns = FALSE,cluster_rows = T,top_annotation = ha3,column_title_gp = gpar(fontsize = 18, fontface = "bold",col="black"),column_names_gp = gpar(fontsize=8),
              #column_split = data.frame(c(rep("Both",ncol(heat_dat_exp)/2),rep("Transcript",ncol(heat_dat_exp)/2) )) ,
              cluster_row_slices = FALSE,  cluster_column_slices = FALSE,row_gap = unit(c( 2), "mm"),border = TRUE,show_column_names = T,row_split = 2,left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:3),labels = c("Cluster 1", "Cluster 2"), labels_gp = gpar(col = "white", fontsize = 10))),heatmap_width = unit(15, "cm"), heatmap_height = unit(15, "cm"),col=col_fun2,row_title = "",row_labels=labels$labels,row_names_gp = gpar(fontsize=8,col = labels$colors))




####Cluster profiler graphs
#Import
preON_up<-as.data.frame(read.delim("Data/Figure6/Figure6E_2A.tab"))
preON_down<-as.data.frame(read.delim("Data/Figure6/Figure6E_2B.tab"))

compare_data<-list(Cluster_1=rownames(preON_up),Cluster_2=rownames(preON_down))

#Takes some minutes to compute...!
ck <- compareCluster(geneCluster = compare_data, fun = "enrichGO",ont="BP",OrgDb='org.Hs.eg.db')


ck_simplyfied <- simplify(ck, cutoff=0.7, by="p.adjust", select_fun=min)



ck_p<-pairwise_termsim(ck_simplyfied, method = "JC", semData = NULL, showCategory = 200)

GESEA_p2<-emapplot(ck_p,showCategory = 30,color = "p.adjust")+ ggtitle("GSEA High ICN AID")+theme(plot.title = element_text(hjust = 0.5,face = "bold",size = 15))


###Plot
PreOn_heatmap

GESEA_p2


Fig 6F

##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")

#####Graph most important genes in pre and prevsON
###Table
volcano_pre<-data.frame(FC=Response.ihw.ICNAID$log2FoldChange,padj=Response.ihw.ICNAID$padj,Gene=Response.ihw.ICNAID$symbol,GSEA="Other",Group="Pre")
volcano_pre$pval_mod<-log10(volcano_pre$padj) #For plotting
volcano_pre$GSEA[volcano_pre$Gene%in%c("CREB1","HLA-DRB1","HSPA1B")]<-"antigen processing and presentation"
volcano_pre$GSEA[volcano_pre$Gene%in%c("GPR37", "HSF1",  "MYB" ,  "NOL3",  "SGK2")]<-"response to oxidative stress"
volcano_pre$GSEA[volcano_pre$Gene%in%c("VCAM1", "CD19")]<-"B cell differentiation"
volcano_pre$label<-""
volcano_pre$label<-ifelse(volcano_pre$GSEA!="Other",volcano_pre$Gene,"")


volcano_on<-data.frame(FC=Response.results.ONvsPRE$log2FoldChange,padj=Response.results.ONvsPRE$padj,Gene=Response.results.ONvsPRE$symbol,GSEA="Other",Group="ON")
###Lavel some genes involved in Metabolic processes for color in GESEA
#Reduce some terms
ck_terms2<-ck_terms[c(1,3,4,7,12,13,14)]
for (i in 1:length(ck_terms2)) {
  ##Extract genes
  labels = data.frame(labels=Response.results.ONvsPRE$symbol,colors=rownames(Response.results.ONvsPRE),entrez=rownames(Response.results.ONvsPRE))
tmp_tag<-ck_p@compareClusterResult$geneID[ck_p@compareClusterResult$Description%in%ck_terms2[i]]
tmp_tag<-strsplit(tmp_tag,split = "/")[[1]]
labels<-labels$labels[labels$entrez%in%tmp_tag]
volcano_on$GSEA<-ifelse(volcano_on$Gene%in%labels,ck_terms2[i],volcano_on$GSEA)

}
volcano_on$GSEA<-ifelse(volcano_on$Gene%in%c("CXCL13","CCR5"),"T-cell reactivity to clonal neoepitopes",volcano_on$GSEA)
volcano_on$GSEA<-ifelse(volcano_on$Gene%in%c("PRKCQ"),"T cell activation",volcano_on$GSEA)

volcano_on$pval_mod<- -log10(volcano_on$padj) #For plotting
volcano_on$label<-ifelse(volcano_on$GSEA!="Other",volcano_on$Gene,"")



volcano_combined<-rbind(volcano_pre,volcano_on)

volcano_combined$GSEA <- factor(volcano_combined$GSEA, levels=c("T-cell reactivity to clonal neoepitopes","antigen processing and presentation","T cell activation","leukocyte cell-cell adhesion","response to oxidative stress","cell growth","B cell differentiation","Other"))
###Volcano plot
library(ggrepel)
##Import data
data_dir<-"/media/user/seagate_ICM/AICDA_analysis/Github_repository/" #Set to where you downloaded the data
setwd(data_dir)
source("r/prerequisites.R")

##Import data....
volcano_pre<-as.data.frame(read.delim("Data/Figure6/Figure6F_1.tab"))
volcano_on<-as.data.frame(read.delim("Data/Figure6/Figure6F_2.tab"))
volcano_combined<-rbind(volcano_pre,volcano_on)

volcano_combined$GSEA <- factor(volcano_combined$GSEA, levels=c("T-cell reactivity to clonal neoepitopes","antigen processing and presentation","T cell activation","leukocyte cell-cell adhesion","response to oxidative stress","cell growth","B cell differentiation","Other"))
###Volcano plot

p_preOn_gsea1<-ggplot(data=volcano_combined, aes(y=FC, x=pval_mod, col=GSEA,shape=Group)) +geom_point(position= position_quasirandom()) +theme_std(base_size=14) + scale_color_manual(name="Pathway",values = c("red","plum3",scales::alpha("grey", alpha = 0.3)))+geom_vline(xintercept=c(-log10(0.2),log10(0.2)), col="black",linetype="dotted") +geom_hline(yintercept=0, col="black",linetype="dotted")+xlab("FDR \nPre<---------|--------->On")+theme(legend.position = "right",legend.box = "vertical") +ylab("Log2(fold-change)")+geom_text_repel(data = volcano_combined,aes(label = label,x=pval_mod, y=FC),  size = 5, box.padding = unit(0.35, "lines"),point.padding = unit(0.3, "lines"),inherit.aes = F,max.overlaps = 90)+ theme(legend.title = element_text(colour="black", size=10,face="bold"),legend.text = element_text(colour="black", size=8))+scale_shape_manual(name="Time Group",values = c(17,16),labels=c(paste0("On (Up =",table(volcano_on$FC>0)[2],"; Down =",table(volcano_on$FC>0)[1],")" ),paste0("Pre (Up =",table(volcano_pre$FC>0)[2],"; Down =",table(volcano_pre$FC>0)[1],")" )))+scale_y_continuous(limits=c(20,26),breaks=seq(20,26,by=2))+scale_x_continuous(limits=c(8.5,14),breaks=seq(8.5,14,by=2)) + labs(x=NULL,y=NULL)+theme(legend.position='none',axis.line.x=element_blank(),axis.ticks.x=element_blank(),axis.text.x=element_blank(),axis.line.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y=element_blank())

p_yaxis<-ggplot(data=volcano_combined, aes(y=FC, x=pval_mod, col=GSEA,shape=Group)) +geom_point(position= position_quasirandom()) +theme_std(base_size=14) + scale_color_manual(name="Pathway",values = c(rep("white",8)))+geom_vline(xintercept=c(-log10(0.2),log10(0.2)), col="black",linetype="dotted") +geom_hline(yintercept=0, col="black",linetype="dotted")+xlab("FDR \nPre<---------|--------->On")+theme(legend.position = "right",legend.box = "vertical") +ylab("Log2(fold-change)")+ labs(x=NULL,y="")+scale_y_continuous(limits=c(20,26),breaks=seq(20,26,by=2))+theme(legend.position='none',axis.line.x=element_blank(),axis.ticks.x=element_blank(),axis.text.x=element_blank())+scale_x_continuous(limits=c(-3,6),breaks=seq(-3,6,by=2))  

p_xaxis<-ggplot(data=volcano_combined, aes(y=FC, x=pval_mod, col=GSEA,shape=Group)) +geom_point(position= position_quasirandom()) +theme_std(base_size=14) + scale_color_manual(name="Pathway",values = c(rep("white",8)))+geom_vline(xintercept=c(-log10(0.2),log10(0.2)), col="black",linetype="dotted") +geom_hline(yintercept=0, col="black",linetype="dotted")+xlab("FDR \nPre<---------|--------->On")+theme(legend.position = "right",legend.box = "vertical") +ylab("Log2(fold-change)")+ labs(x=NULL,y=NULL)+scale_y_continuous(limits=c(-8,10),breaks=seq(-8,10,by=2))+theme(legend.position='none',axis.line.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y=element_blank())+scale_x_continuous(limits=c(8.5,14),breaks=seq(8.5,14,by=2))



p_preOn_gsea2<-ggplot(data=volcano_combined, aes(y=FC, x=pval_mod, col=GSEA,shape=Group)) +geom_point(position= position_quasirandom()) +theme_std(base_size=14) + scale_color_manual(name="Pathway",values = c("chocolate1"))+geom_vline(xintercept=c(-log10(0.2),log10(0.2)), col="black",linetype="dotted") +geom_hline(yintercept=0, col="black",linetype="dotted")+xlab("FDR \nPre<---------|--------->On")+theme(legend.position = "right",legend.box = "vertical") +ylab("Log2(fold-change)")+geom_text_repel(data = volcano_combined,aes(label = label,x=pval_mod, y=FC),  size = 5, box.padding = unit(0.35, "lines"),point.padding = unit(0.3, "lines"),inherit.aes = F,max.overlaps = 90)+ theme(legend.title = element_text(colour="black", size=10,face="bold"),legend.text = element_text(colour="black", size=8))+scale_shape_manual(name="Time Group",values = c(17,16),labels=c(paste0("On (Up =",table(volcano_on$FC>0)[2],"; Down =",table(volcano_on$FC>0)[1],")" ),paste0("Pre (Up =",table(volcano_pre$FC>0)[2],"; Down =",table(volcano_pre$FC>0)[1],")" )))+scale_y_continuous(limits=c(20,26),breaks=seq(20,26,by=2))+scale_x_continuous(limits=c(24,26),breaks=seq(24,26,by=2)) + labs(x=NULL,y=NULL)+theme(legend.position='none',axis.line.x=element_blank(),axis.ticks.x=element_blank(),axis.text.x=element_blank(),axis.line.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y=element_blank()) 


p_xaxis2<-ggplot(data=volcano_combined, aes(y=FC, x=pval_mod, col=GSEA,shape=Group)) +geom_point(position= position_quasirandom()) +theme_std(base_size=14) + scale_color_manual(name="Pathway",values = c(rep("white",8)))+geom_vline(xintercept=c(-log10(0.2),log10(0.2)), col="black",linetype="dotted") +geom_hline(yintercept=0, col="black",linetype="dotted")+xlab("FDR \nPre<---------|--------->On")+theme(legend.position = "right",legend.box = "vertical") +ylab("Log2(fold-change)")+ labs(x=NULL,y=NULL)+scale_y_continuous(limits=c(-8,10),breaks=seq(-8,10,by=2))+theme(legend.position='none',axis.line.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y=element_blank())+scale_x_continuous(limits=c(24,26),breaks=seq(24,26,by=2))

  


p_preOn_gsea3<-ggplot(data=volcano_combined, aes(y=FC, x=pval_mod, col=GSEA,shape=Group)) +geom_point(position= position_quasirandom()) +theme_std(base_size=14) + scale_color_manual(name="Pathway",values = c("red","skyblue1","chocolate1","darkgreen","plum3","sienna4","lightseagreen",scales::alpha("grey", alpha = 0.3)))+geom_vline(xintercept=c(-log10(0.2),log10(0.2)), col="black",linetype="dotted") +geom_hline(yintercept=0, col="black",linetype="dotted")+xlab("FDR")+theme(legend.position = "right",legend.box = "vertical") +ylab("Log2(fold-change)")+geom_text_repel(data = volcano_combined,aes(label = label,x=pval_mod, y=FC),  size = 5, box.padding = unit(0.35, "lines"),point.padding = unit(0.3, "lines"),inherit.aes = F,max.overlaps = 25)+ theme(legend.title = element_text(colour="black", size=10,face="bold"),legend.text = element_text(colour="black", size=8))+scale_shape_manual(name="Time Group",values = c(17,16),labels=c(paste0("On (Up =",table(volcano_on$FC>0)[2],"; Down =",table(volcano_on$FC>0)[1],")" ),paste0("Pre (Up =",table(volcano_pre$FC>0)[2],"; Down =",table(volcano_pre$FC>0)[1],")" )))+ylim(c(-8,10))+xlim(c(-3,9))+scale_x_continuous(limits=c(-3,6),breaks=seq(-3,6,by=2))+scale_y_continuous(limits=c(-8,10),breaks=seq(-8,10,by=2))

p1_legend <- extract_gglegend(p_preOn_gsea3)
p_preOn_gsea3<-p_preOn_gsea3+theme(legend.position='none')


###Merge plots

p_preOn_gsea4 <- plot_grid(p_yaxis,p_preOn_gsea1,p_preOn_gsea2,p_preOn_gsea3,p_xaxis,p_xaxis2,rel_heights=c(1,3),align='h',ncol=3,nrow=2,rel_widths=c(5,1,.5))

xbp_grob <- ggplotGrob(p1_legend$legend)

p_preOn_gsea_final<-p_preOn_gsea4+theme(plot.margin = unit(c(0,0,2,3), "lines"))+annotation_custom(grob = xbp_grob,xmin = .8,xmax = 1.0,ymin = 0.7,ymax = .45)+annotate("segment", x=0.29, y=0, xend=.45, yend=0,col="black", linetype="solid",arrow=arrow(length=unit(0.3, "cm")))+annotate("text", x=0.53, y=0,col="black", label="Increasing significance for On-therapy \nFDR = -Log10(padj-value)",fontface=2)+annotate("segment", xend=0.2, y=0, x=.28, yend=0,col="black", linetype="solid",arrow=arrow(length=unit(0.3, "cm")))+annotate("text", x=.12, y=0,col="black", label="Increasing significance for Pre-therapy \nFDR = +Log10(padj-value)",fontface=2)



p_preOn_gsea_final